SDSoCの高位合成について

SDSoCやVivado HLSにしてもプラグマを入れることで性能を左右することができる。

では、プラグマを指定すると、どこで性能が変化するように構成されるのだろうか？

次のようなforループがあったとする。

#define HEIGHT 100
#define WIDTH 100

/*
 * Element-wise product of two HEIGHT x WIDTH row-major float arrays.
 *
 *   dst    : output array, HEIGHT * WIDTH elements
 *   din    : first input array, HEIGHT * WIDTH elements
 *   weight : second input array, HEIGHT * WIDTH elements
 *
 * Writes dst[i] = din[i] * weight[i] for every element and always returns 0.
 */
int function(float *dst, float *din, float *weight)
{
  int y, x;
  int offset;

  for(y = 0; y < HEIGHT; ++y){
    for(x = 0; x < WIDTH; ++x){
      /* Row-major index: the row stride is the row length (WIDTH), not
         HEIGHT. The two only coincide here because both are 100. */
      offset = y * WIDTH + x;
      dst[offset] = din[offset] * weight[offset];
    }
  }

  return 0;
}

LLVM-IRへ変換する

clangでLLVM-IRを出力してみよう。

clang -S -O0 -emit-llvm function.c

次のような結果が出力される。

; ModuleID = 'function.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: nounwind uwtable
; Unoptimized (-O0) IR for function(dst, din, weight).
; Every parameter and local variable gets its own stack slot (alloca) and is
; reloaded from memory on each use; no value is kept in a virtual register
; across statements.
define i32 @function(float* %dst, float* %din, float* %weight) #0 {
  ; %1, %2, %3 are the stack slots for dst, din and weight (see the stores below).
  %1 = alloca float*, align 8
  %2 = alloca float*, align 8
  %3 = alloca float*, align 8
  %y = alloca i32, align 4
  %x = alloca i32, align 4
  %offset = alloca i32, align 4
  store float* %dst, float** %1, align 8
  store float* %din, float** %2, align 8
  store float* %weight, float** %3, align 8
  ; y = 0
  store i32 0, i32* %y, align 4
  br label %4

; Outer loop header: continue while y < 100, otherwise go to the return block.
; <label>:4                                       ; preds = %35, %0
  %5 = load i32, i32* %y, align 4
  %6 = icmp slt i32 %5, 100
  br i1 %6, label %7, label %38

; Inner loop initialisation: x = 0.
; <label>:7                                       ; preds = %4
  store i32 0, i32* %x, align 4
  br label %8

; Inner loop header: continue while x < 100.
; <label>:8                                       ; preds = %31, %7
  %9 = load i32, i32* %x, align 4
  %10 = icmp slt i32 %9, 100
  br i1 %10, label %11, label %34

; Loop body: offset = y * 100 + x; dst[offset] = din[offset] * weight[offset].
; offset is stored once and then reloaded three times, once per array access.
; <label>:11                                      ; preds = %8
  %12 = load i32, i32* %y, align 4
  %13 = mul nsw i32 %12, 100
  %14 = load i32, i32* %x, align 4
  %15 = add nsw i32 %13, %14
  store i32 %15, i32* %offset, align 4
  ; load din[offset]
  %16 = load i32, i32* %offset, align 4
  %17 = sext i32 %16 to i64
  %18 = load float*, float** %2, align 8
  %19 = getelementptr inbounds float, float* %18, i64 %17
  %20 = load float, float* %19, align 4
  ; load weight[offset]
  %21 = load i32, i32* %offset, align 4
  %22 = sext i32 %21 to i64
  %23 = load float*, float** %3, align 8
  %24 = getelementptr inbounds float, float* %23, i64 %22
  %25 = load float, float* %24, align 4
  ; multiply and store to dst[offset]
  %26 = fmul float %20, %25
  %27 = load i32, i32* %offset, align 4
  %28 = sext i32 %27 to i64
  %29 = load float*, float** %1, align 8
  %30 = getelementptr inbounds float, float* %29, i64 %28
  store float %26, float* %30, align 4
  br label %31

; Inner loop increment: ++x, then back to the inner header.
; <label>:31                                      ; preds = %11
  %32 = load i32, i32* %x, align 4
  %33 = add nsw i32 %32, 1
  store i32 %33, i32* %x, align 4
  br label %8

; Inner loop exit.
; <label>:34                                      ; preds = %8
  br label %35

; Outer loop increment: ++y, then back to the outer header.
; <label>:35                                      ; preds = %34
  %36 = load i32, i32* %y, align 4
  %37 = add nsw i32 %36, 1
  store i32 %37, i32* %y, align 4
  br label %4

; Outer loop exit: return 0.
; <label>:38                                      ; preds = %4
  ret i32 0
}

attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}

読みづらいかもしれないが、おおよその流れはわかるだろう。

愚直に実行していることが読み取れる。

オプティマイズ：-O1

ここからオプティマイズをかけてみよう。

opt -S -O1 -o function.opt1.ll function.ll

まずは-O1の結果から見ていく。

; ModuleID = 'function.ll'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: norecurse nounwind uwtable
; IR after `opt -O1`.
; The stack slots are gone: both loop counters are now phi nodes, and the
; inner loop body has been unrolled four times (25 iterations of 4 elements).
define i32 @function(float* nocapture %dst, float* nocapture readonly %din, float* nocapture readonly %weight) #0 {
  br label %.preheader

; Outer loop: %indvars.iv3 is the row counter, %1 is the row base offset (row * 100).
.preheader:                                       ; preds = %31, %0
  %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %31 ]
  %1 = mul nuw nsw i64 %indvars.iv3, 100
  br label %2

; Inner loop, unrolled by 4: %indvars.iv starts at 0 and advances by 4.
; Because it is always a multiple of 4, `or` with 1, 2 and 3 is used in place
; of adding 1, 2 and 3 to form the next three element indices.
; <label>:2                                       ; preds = %2, %.preheader
  %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next.3, %2 ]
  ; element +0
  %3 = add nuw nsw i64 %indvars.iv, %1
  %4 = getelementptr inbounds float, float* %din, i64 %3
  %5 = load float, float* %4, align 4
  %6 = getelementptr inbounds float, float* %weight, i64 %3
  %7 = load float, float* %6, align 4
  %8 = fmul float %5, %7
  %9 = getelementptr inbounds float, float* %dst, i64 %3
  store float %8, float* %9, align 4
  ; element +1
  %indvars.iv.next = or i64 %indvars.iv, 1
  %10 = add nuw nsw i64 %indvars.iv.next, %1
  %11 = getelementptr inbounds float, float* %din, i64 %10
  %12 = load float, float* %11, align 4
  %13 = getelementptr inbounds float, float* %weight, i64 %10
  %14 = load float, float* %13, align 4
  %15 = fmul float %12, %14
  %16 = getelementptr inbounds float, float* %dst, i64 %10
  store float %15, float* %16, align 4
  ; element +2
  %indvars.iv.next.1 = or i64 %indvars.iv, 2
  %17 = add nuw nsw i64 %indvars.iv.next.1, %1
  %18 = getelementptr inbounds float, float* %din, i64 %17
  %19 = load float, float* %18, align 4
  %20 = getelementptr inbounds float, float* %weight, i64 %17
  %21 = load float, float* %20, align 4
  %22 = fmul float %19, %21
  %23 = getelementptr inbounds float, float* %dst, i64 %17
  store float %22, float* %23, align 4
  ; element +3
  %indvars.iv.next.2 = or i64 %indvars.iv, 3
  %24 = add nuw nsw i64 %indvars.iv.next.2, %1
  %25 = getelementptr inbounds float, float* %din, i64 %24
  %26 = load float, float* %25, align 4
  %27 = getelementptr inbounds float, float* %weight, i64 %24
  %28 = load float, float* %27, align 4
  %29 = fmul float %26, %28
  %30 = getelementptr inbounds float, float* %dst, i64 %24
  store float %29, float* %30, align 4
  ; advance by 4 and leave the inner loop once 100 elements are done
  %indvars.iv.next.3 = add nsw i64 %indvars.iv, 4
  %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 100
  br i1 %exitcond.3, label %31, label %2

; Outer loop increment: next row, exit after 100 rows.
; <label>:31                                      ; preds = %2
  %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
  %exitcond5 = icmp eq i64 %indvars.iv.next4, 100
  br i1 %exitcond5, label %32, label %.preheader

; <label>:32                                      ; preds = %31
  ret i32 0
}

attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}

-O1でオプティマイズをかけると、内側のxのループが4回分アンロール（展開）され、1回の反復で4要素分の演算をまとめて行うループとして構成されるようになった。

オプティマイズ：-O2

次に-O2でオプティマイズしてみよう。

opt -S -O2 -o function.opt2.ll function.ll

結果は次のようになる。

; ModuleID = 'function.ll'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: norecurse nounwind uwtable
; Vectorized IR with two versions of the inner loop.
; For every row, a runtime overlap check on the three pointers selects either
; the <4 x float> vector loop or the scalar loop (itself unrolled by 4).
define i32 @function(float* nocapture %dst, float* nocapture readonly %din, float* nocapture readonly %weight) #0 {
  br label %.preheader

; Outer loop over rows, plus the per-row runtime overlap check.
.preheader:                                       ; preds = %middle.block, %0
  %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %middle.block ]
  ; %1 = index of the first element of this row, %2 = index of the last one
  %1 = mul i64 %indvars.iv3, 100
  %scevgep = getelementptr float, float* %dst, i64 %1
  %2 = add i64 %1, 99
  %scevgep7 = getelementptr float, float* %dst, i64 %2
  %scevgep9 = getelementptr float, float* %din, i64 %1
  %scevgep11 = getelementptr float, float* %din, i64 %2
  %scevgep13 = getelementptr float, float* %weight, i64 %1
  %scevgep15 = getelementptr float, float* %weight, i64 %2
  ; %3 = row base offset used by both loop versions below
  %3 = mul nuw nsw i64 %indvars.iv3, 100
  ; dst row vs. din row: overlap if dst.first <= din.last and din.first <= dst.last
  %bound0 = icmp ule float* %scevgep, %scevgep11
  %bound1 = icmp ule float* %scevgep9, %scevgep7
  %found.conflict = and i1 %bound0, %bound1
  ; dst row vs. weight row: same test
  %bound017 = icmp ule float* %scevgep, %scevgep15
  %bound118 = icmp ule float* %scevgep13, %scevgep7
  %found.conflict19 = and i1 %bound017, %bound118
  ; any overlap -> scalar loop, no overlap -> vector loop
  %conflict.rdx = or i1 %found.conflict, %found.conflict19
  br i1 %conflict.rdx, label %scalar.ph.preheader, label %vector.body.preheader

vector.body.preheader:                            ; preds = %.preheader
  br label %vector.body

scalar.ph.preheader:                              ; preds = %.preheader
  br label %scalar.ph

; Vector version of the inner loop: each iteration loads four floats from din
; and weight, multiplies them with one <4 x float> fmul and stores four floats
; to dst. %index advances by 4 until it reaches 100.
vector.body:                                      ; preds = %vector.body.preheader, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ]
  %4 = add i64 %index, %3
  %5 = getelementptr inbounds float, float* %din, i64 %4
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %6, align 4
  %7 = getelementptr inbounds float, float* %weight, i64 %4
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load22 = load <4 x float>, <4 x float>* %8, align 4
  %9 = fmul <4 x float> %wide.load, %wide.load22
  %10 = getelementptr inbounds float, float* %dst, i64 %4
  %11 = bitcast float* %10 to <4 x float>*
  store <4 x float> %9, <4 x float>* %11, align 4
  %index.next = add i64 %index, 4
  %12 = icmp eq i64 %index.next, 100
  br i1 %12, label %middle.block.loopexit23, label %vector.body, !llvm.loop !1

; Scalar version of the inner loop, taken when the rows may overlap.
; It processes one float at a time and is unrolled by 4: %indvars.iv advances
; by 4, and `or` with 1, 2 and 3 forms the following three element indices.
scalar.ph:                                        ; preds = %scalar.ph, %scalar.ph.preheader
  %indvars.iv = phi i64 [ 0, %scalar.ph.preheader ], [ %indvars.iv.next.3, %scalar.ph ]
  ; element +0
  %13 = add nuw nsw i64 %indvars.iv, %3
  %14 = getelementptr inbounds float, float* %din, i64 %13
  %15 = load float, float* %14, align 4
  %16 = getelementptr inbounds float, float* %weight, i64 %13
  %17 = load float, float* %16, align 4
  %18 = fmul float %15, %17
  %19 = getelementptr inbounds float, float* %dst, i64 %13
  store float %18, float* %19, align 4
  ; element +1
  %indvars.iv.next = or i64 %indvars.iv, 1
  %20 = add nuw nsw i64 %indvars.iv.next, %3
  %21 = getelementptr inbounds float, float* %din, i64 %20
  %22 = load float, float* %21, align 4
  %23 = getelementptr inbounds float, float* %weight, i64 %20
  %24 = load float, float* %23, align 4
  %25 = fmul float %22, %24
  %26 = getelementptr inbounds float, float* %dst, i64 %20
  store float %25, float* %26, align 4
  ; element +2
  %indvars.iv.next.1 = or i64 %indvars.iv, 2
  %27 = add nuw nsw i64 %indvars.iv.next.1, %3
  %28 = getelementptr inbounds float, float* %din, i64 %27
  %29 = load float, float* %28, align 4
  %30 = getelementptr inbounds float, float* %weight, i64 %27
  %31 = load float, float* %30, align 4
  %32 = fmul float %29, %31
  %33 = getelementptr inbounds float, float* %dst, i64 %27
  store float %32, float* %33, align 4
  ; element +3
  %indvars.iv.next.2 = or i64 %indvars.iv, 3
  %34 = add nuw nsw i64 %indvars.iv.next.2, %3
  %35 = getelementptr inbounds float, float* %din, i64 %34
  %36 = load float, float* %35, align 4
  %37 = getelementptr inbounds float, float* %weight, i64 %34
  %38 = load float, float* %37, align 4
  %39 = fmul float %36, %38
  %40 = getelementptr inbounds float, float* %dst, i64 %34
  store float %39, float* %40, align 4
  %indvars.iv.next.3 = add nsw i64 %indvars.iv, 4
  %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 100
  br i1 %exitcond.3, label %middle.block.loopexit, label %scalar.ph, !llvm.loop !4

middle.block.loopexit:                            ; preds = %scalar.ph
  br label %middle.block

middle.block.loopexit23:                          ; preds = %vector.body
  br label %middle.block

; Both loop versions rejoin here: advance to the next row, exit after 100 rows.
middle.block:                                     ; preds = %middle.block.loopexit23, %middle.block.loopexit
  %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
  %exitcond5 = icmp eq i64 %indvars.iv.next4, 100
  br i1 %exitcond5, label %41, label %.preheader

; <label>:41                                      ; preds = %middle.block
  ret i32 0
}

attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 1}
!3 = !{!"llvm.loop.interleave.count", i32 1}
!4 = distinct !{!4, !2, !3}

-O2のオプティマイズでは.preheader:が元のソースコードのyのforループ部分である。そして、注目すべき点は、xのループがvector.body:とscalar.ph:ラベルの2つのループで構成されたことである。ラベルの通り、ベクター処理かスカラー処理である。

vector.body:とscalar.ph:の分岐条件を確認してみると、dstの書き込み範囲がdinまたはweightの読み出し範囲と重なっているかどうか（メモリのエイリアス）を行ごとに実行時に検査している。重なりがある場合はscalar.ph:へ、重なりがない場合はvector.body:へ分岐する。

オプティマイズ：-O3

次に-O3でオプティマイズしてみよう。

opt -S -O3 -o function.opt3.ll function.ll

-O3の結果は-O2と同じであった。

元のソースコード自体が複雑ではないので結果がおなじになってしまったのだと思われる。

まとめ

今回optが対象としたアーキテクチャーはx86-64である。他のアーキテクチャーに変更したところ、ベクタライズされなくなった。