
SDSoCやVivado HLSにしてもプラグマを入れることで性能を左右することができる。



#define HEIGHT 100
#define WIDTH 100

/*
 * Element-wise multiply of two HEIGHT x WIDTH float arrays stored as flat
 * buffers: dst[i] = din[i] * weight[i] for every i in [0, HEIGHT * WIDTH).
 *
 * dst    - output buffer; every element in the range above is written
 * din    - first input buffer; read over the same range
 * weight - second input buffer; read over the same range
 *
 * Always returns 0.
 *
 * The pointers are deliberately left without const/restrict so that the
 * signature stays exactly as it was.
 */
int function(float *dst, float *din, float *weight)
{
  int y, x;
  int offset;

  for(y = 0; y < HEIGHT; ++y){
    for(x = 0; x < WIDTH; ++x){
      /* The row stride is the row length (WIDTH), not the row count. */
      offset = y * WIDTH + x;
      dst[offset] = din[offset] * weight[offset];
    }
  }

  return 0;
}



clang -S -O0 -emit-llvm function.c


; ModuleID = 'function.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: nounwind uwtable
; Unoptimized (-O0) IR: every parameter and local lives in a stack slot
; (alloca) and is reloaded before each use.
define i32 @function(float* %dst, float* %din, float* %weight) #0 {
  %1 = alloca float*, align 8
  %2 = alloca float*, align 8
  %3 = alloca float*, align 8
  %y = alloca i32, align 4
  %x = alloca i32, align 4
  %offset = alloca i32, align 4
  store float* %dst, float** %1, align 8
  store float* %din, float** %2, align 8
  store float* %weight, float** %3, align 8
  store i32 0, i32* %y, align 4
  br label %4

; Outer loop condition: y < 100
; <label>:4                                       ; preds = %35, %0
  %5 = load i32, i32* %y, align 4
  %6 = icmp slt i32 %5, 100
  br i1 %6, label %7, label %38

; Inner loop initialisation: x = 0
; <label>:7                                       ; preds = %4
  store i32 0, i32* %x, align 4
  br label %8

; Inner loop condition: x < 100
; <label>:8                                       ; preds = %31, %7
  %9 = load i32, i32* %x, align 4
  %10 = icmp slt i32 %9, 100
  br i1 %10, label %11, label %34

; Loop body: offset = y * 100 + x; dst[offset] = din[offset] * weight[offset]
; <label>:11                                      ; preds = %8
  %12 = load i32, i32* %y, align 4
  %13 = mul nsw i32 %12, 100
  %14 = load i32, i32* %x, align 4
  %15 = add nsw i32 %13, %14
  store i32 %15, i32* %offset, align 4
  %16 = load i32, i32* %offset, align 4
  %17 = sext i32 %16 to i64
  %18 = load float*, float** %2, align 8
  %19 = getelementptr inbounds float, float* %18, i64 %17
  %20 = load float, float* %19, align 4
  %21 = load i32, i32* %offset, align 4
  %22 = sext i32 %21 to i64
  %23 = load float*, float** %3, align 8
  %24 = getelementptr inbounds float, float* %23, i64 %22
  %25 = load float, float* %24, align 4
  %26 = fmul float %20, %25
  %27 = load i32, i32* %offset, align 4
  %28 = sext i32 %27 to i64
  %29 = load float*, float** %1, align 8
  %30 = getelementptr inbounds float, float* %29, i64 %28
  store float %26, float* %30, align 4
  br label %31

; Inner loop increment: ++x
; <label>:31                                      ; preds = %11
  %32 = load i32, i32* %x, align 4
  %33 = add nsw i32 %32, 1
  store i32 %33, i32* %x, align 4
  br label %8

; <label>:34                                      ; preds = %8
  br label %35

; Outer loop increment: ++y
; <label>:35                                      ; preds = %34
  %36 = load i32, i32* %y, align 4
  %37 = add nsw i32 %36, 1
  store i32 %37, i32* %y, align 4
  br label %4

; <label>:38                                      ; preds = %4
  ret i32 0
}

attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}





opt -S -O1 -o function.opt1.ll function.ll


; ModuleID = 'function.ll'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: norecurse nounwind uwtable
; -O1 IR: the stack slots are gone and the inner (x) loop is unrolled by 4.
; NOTE(review): the %indvars.iv.next* names were missing from the listing and
; are restored here from the surrounding uses - confirm against a fresh opt run.
define i32 @function(float* nocapture %dst, float* nocapture readonly %din, float* nocapture readonly %weight) #0 {
  br label %.preheader

; Outer (y) loop header: %indvars.iv3 is y, %1 is y * 100.
.preheader:                                       ; preds = %31, %0
  %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %31 ]
  %1 = mul nuw nsw i64 %indvars.iv3, 100
  br label %2

; Inner (x) loop: four elements per iteration, x advances by 4.
; <label>:2                                       ; preds = %2, %.preheader
  %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next.3, %2 ]
  %3 = add nuw nsw i64 %indvars.iv, %1
  %4 = getelementptr inbounds float, float* %din, i64 %3
  %5 = load float, float* %4, align 4
  %6 = getelementptr inbounds float, float* %weight, i64 %3
  %7 = load float, float* %6, align 4
  %8 = fmul float %5, %7
  %9 = getelementptr inbounds float, float* %dst, i64 %3
  store float %8, float* %9, align 4
  %indvars.iv.next = or i64 %indvars.iv, 1
  %10 = add nuw nsw i64 %indvars.iv.next, %1
  %11 = getelementptr inbounds float, float* %din, i64 %10
  %12 = load float, float* %11, align 4
  %13 = getelementptr inbounds float, float* %weight, i64 %10
  %14 = load float, float* %13, align 4
  %15 = fmul float %12, %14
  %16 = getelementptr inbounds float, float* %dst, i64 %10
  store float %15, float* %16, align 4
  %indvars.iv.next.1 = or i64 %indvars.iv, 2
  %17 = add nuw nsw i64 %indvars.iv.next.1, %1
  %18 = getelementptr inbounds float, float* %din, i64 %17
  %19 = load float, float* %18, align 4
  %20 = getelementptr inbounds float, float* %weight, i64 %17
  %21 = load float, float* %20, align 4
  %22 = fmul float %19, %21
  %23 = getelementptr inbounds float, float* %dst, i64 %17
  store float %22, float* %23, align 4
  %indvars.iv.next.2 = or i64 %indvars.iv, 3
  %24 = add nuw nsw i64 %indvars.iv.next.2, %1
  %25 = getelementptr inbounds float, float* %din, i64 %24
  %26 = load float, float* %25, align 4
  %27 = getelementptr inbounds float, float* %weight, i64 %24
  %28 = load float, float* %27, align 4
  %29 = fmul float %26, %28
  %30 = getelementptr inbounds float, float* %dst, i64 %24
  store float %29, float* %30, align 4
  %indvars.iv.next.3 = add nsw i64 %indvars.iv, 4
  %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 100
  br i1 %exitcond.3, label %31, label %2

; Outer loop latch: ++y, exit when y == 100.
; <label>:31                                      ; preds = %2
  %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
  %exitcond5 = icmp eq i64 %indvars.iv.next4, 100
  br i1 %exitcond5, label %32, label %.preheader

; <label>:32                                      ; preds = %31
  ret i32 0
}

attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}




opt -S -O2 -o function.opt2.ll function.ll


; ModuleID = 'function.ll'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: norecurse nounwind uwtable
; -O2 IR: the inner (x) loop exists twice - a <4 x float> vector loop and a
; scalar loop unrolled by 4 - selected per row by a runtime pointer-overlap check.
; NOTE(review): %index.next, %indvars.iv.next* and the scalar.ph /
; scalar.ph.preheader labels were missing from the listing; they are restored
; here following LLVM's usual naming - confirm against a fresh opt run.
define i32 @function(float* nocapture %dst, float* nocapture readonly %din, float* nocapture readonly %weight) #0 {
  br label %.preheader

; Outer (y) loop header. Computes the first and last element address of the
; current row in each buffer and tests whether dst's row overlaps din's or
; weight's row.
.preheader:                                       ; preds = %middle.block, %0
  %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %middle.block ]
  %1 = mul i64 %indvars.iv3, 100
  %scevgep = getelementptr float, float* %dst, i64 %1
  %2 = add i64 %1, 99
  %scevgep7 = getelementptr float, float* %dst, i64 %2
  %scevgep9 = getelementptr float, float* %din, i64 %1
  %scevgep11 = getelementptr float, float* %din, i64 %2
  %scevgep13 = getelementptr float, float* %weight, i64 %1
  %scevgep15 = getelementptr float, float* %weight, i64 %2
  %3 = mul nuw nsw i64 %indvars.iv3, 100
  %bound0 = icmp ule float* %scevgep, %scevgep11
  %bound1 = icmp ule float* %scevgep9, %scevgep7
  %found.conflict = and i1 %bound0, %bound1
  %bound017 = icmp ule float* %scevgep, %scevgep15
  %bound118 = icmp ule float* %scevgep13, %scevgep7
  %found.conflict19 = and i1 %bound017, %bound118
  %conflict.rdx = or i1 %found.conflict, %found.conflict19
  br i1 %conflict.rdx, label %scalar.ph.preheader, label %vector.body.preheader

vector.body.preheader:                            ; preds = %.preheader
  br label %vector.body

scalar.ph.preheader:                              ; preds = %.preheader
  br label %scalar.ph

; Vector loop: one <4 x float> multiply per iteration, x advances by 4.
vector.body:                                      ; preds = %vector.body.preheader, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ]
  %4 = add i64 %index, %3
  %5 = getelementptr inbounds float, float* %din, i64 %4
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %6, align 4
  %7 = getelementptr inbounds float, float* %weight, i64 %4
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load22 = load <4 x float>, <4 x float>* %8, align 4
  %9 = fmul <4 x float> %wide.load, %wide.load22
  %10 = getelementptr inbounds float, float* %dst, i64 %4
  %11 = bitcast float* %10 to <4 x float>*
  store <4 x float> %9, <4 x float>* %11, align 4
  %index.next = add i64 %index, 4
  %12 = icmp eq i64 %index.next, 100
  br i1 %12, label %middle.block.loopexit23, label %vector.body, !llvm.loop !1

; Scalar loop (taken when the overlap check fires): four scalar multiplies
; per iteration, x advances by 4.
scalar.ph:                                        ; preds = %scalar.ph.preheader, %scalar.ph
  %indvars.iv = phi i64 [ 0, %scalar.ph.preheader ], [ %indvars.iv.next.3, %scalar.ph ]
  %13 = add nuw nsw i64 %indvars.iv, %3
  %14 = getelementptr inbounds float, float* %din, i64 %13
  %15 = load float, float* %14, align 4
  %16 = getelementptr inbounds float, float* %weight, i64 %13
  %17 = load float, float* %16, align 4
  %18 = fmul float %15, %17
  %19 = getelementptr inbounds float, float* %dst, i64 %13
  store float %18, float* %19, align 4
  %indvars.iv.next = or i64 %indvars.iv, 1
  %20 = add nuw nsw i64 %indvars.iv.next, %3
  %21 = getelementptr inbounds float, float* %din, i64 %20
  %22 = load float, float* %21, align 4
  %23 = getelementptr inbounds float, float* %weight, i64 %20
  %24 = load float, float* %23, align 4
  %25 = fmul float %22, %24
  %26 = getelementptr inbounds float, float* %dst, i64 %20
  store float %25, float* %26, align 4
  %indvars.iv.next.1 = or i64 %indvars.iv, 2
  %27 = add nuw nsw i64 %indvars.iv.next.1, %3
  %28 = getelementptr inbounds float, float* %din, i64 %27
  %29 = load float, float* %28, align 4
  %30 = getelementptr inbounds float, float* %weight, i64 %27
  %31 = load float, float* %30, align 4
  %32 = fmul float %29, %31
  %33 = getelementptr inbounds float, float* %dst, i64 %27
  store float %32, float* %33, align 4
  %indvars.iv.next.2 = or i64 %indvars.iv, 3
  %34 = add nuw nsw i64 %indvars.iv.next.2, %3
  %35 = getelementptr inbounds float, float* %din, i64 %34
  %36 = load float, float* %35, align 4
  %37 = getelementptr inbounds float, float* %weight, i64 %34
  %38 = load float, float* %37, align 4
  %39 = fmul float %36, %38
  %40 = getelementptr inbounds float, float* %dst, i64 %34
  store float %39, float* %40, align 4
  %indvars.iv.next.3 = add nsw i64 %indvars.iv, 4
  %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 100
  br i1 %exitcond.3, label %middle.block.loopexit, label %scalar.ph, !llvm.loop !4

middle.block.loopexit:                            ; preds = %scalar.ph
  br label %middle.block

middle.block.loopexit23:                          ; preds = %vector.body
  br label %middle.block

; Outer loop latch: ++y, exit when y == 100.
middle.block:                                     ; preds = %middle.block.loopexit23, %middle.block.loopexit
  %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
  %exitcond5 = icmp eq i64 %indvars.iv.next4, 100
  br i1 %exitcond5, label %41, label %.preheader

; <label>:41                                      ; preds = %middle.block
  ret i32 0
}

attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 1}
!3 = !{!"llvm.loop.interleave.count", i32 1}
!4 = distinct !{!4, !2, !3}

-O2の最適化では、.preheader: が元のソースコードのyのforループ部分である。そして注目すべき点は、xのループがvector.body: のループと、その後に続くスカラー処理用のループ（scalar.ph:）の2つで構成されていることである。ラベル名の通り、前者がベクター処理、後者がスカラー処理である。




opt -S -O3 -o function.opt3.ll function.ll



