SDSoCやVivado HLSにしてもプラグマを入れることで性能を左右することができる。
では、プラグマを指定すると、どこで性能が変化するように構成されるのだろうか?
次のようなforループがあったとする。
#define HEIGHT 100
#define WIDTH 100

/*
 * Element-wise product of two HEIGHT x WIDTH float buffers.
 *
 * For every index i in [0, HEIGHT * WIDTH) this stores
 * din[i] * weight[i] into dst[i].
 *
 * dst    : output buffer, written at HEIGHT * WIDTH consecutive elements
 * din    : first input buffer, read at the same indices
 * weight : second input buffer, read at the same indices
 *
 * Always returns 0.
 */
int function(float *dst, float *din, float *weight)
{
    int y, x;
    int offset;

    for (y = 0; y < HEIGHT; ++y) {
        for (x = 0; x < WIDTH; ++x) {
            /*
             * Row-major index: the stride between rows is the row length
             * (WIDTH). Using HEIGHT here only works while both are equal.
             */
            offset = y * WIDTH + x;
            dst[offset] = din[offset] * weight[offset];
        }
    }
    return 0;
}
clangでLLVM-IRを出力してみよう。
clang -S -O0 -emit-llvm function.c
次のような結果が出力される。
; ModuleID = 'function.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; Function Attrs: nounwind uwtable
define i32 @function(float* %dst, float* %din, float* %weight) #0 {
%1 = alloca float*, align 8
%2 = alloca float*, align 8
%3 = alloca float*, align 8
%y = alloca i32, align 4
%x = alloca i32, align 4
%offset = alloca i32, align 4
store float* %dst, float** %1, align 8
store float* %din, float** %2, align 8
store float* %weight, float** %3, align 8
store i32 0, i32* %y, align 4
br label %4
; <label>:4 ; preds = %35, %0
%5 = load i32, i32* %y, align 4
%6 = icmp slt i32 %5, 100
br i1 %6, label %7, label %38
; <label>:7 ; preds = %4
store i32 0, i32* %x, align 4
br label %8
; <label>:8 ; preds = %31, %7
%9 = load i32, i32* %x, align 4
%10 = icmp slt i32 %9, 100
br i1 %10, label %11, label %34
; <label>:11 ; preds = %8
%12 = load i32, i32* %y, align 4
%13 = mul nsw i32 %12, 100
%14 = load i32, i32* %x, align 4
%15 = add nsw i32 %13, %14
store i32 %15, i32* %offset, align 4
%16 = load i32, i32* %offset, align 4
%17 = sext i32 %16 to i64
%18 = load float*, float** %2, align 8
%19 = getelementptr inbounds float, float* %18, i64 %17
%20 = load float, float* %19, align 4
%21 = load i32, i32* %offset, align 4
%22 = sext i32 %21 to i64
%23 = load float*, float** %3, align 8
%24 = getelementptr inbounds float, float* %23, i64 %22
%25 = load float, float* %24, align 4
%26 = fmul float %20, %25
%27 = load i32, i32* %offset, align 4
%28 = sext i32 %27 to i64
%29 = load float*, float** %1, align 8
%30 = getelementptr inbounds float, float* %29, i64 %28
store float %26, float* %30, align 4
br label %31
; <label>:31 ; preds = %11
%32 = load i32, i32* %x, align 4
%33 = add nsw i32 %32, 1
store i32 %33, i32* %x, align 4
br label %8
; <label>:34 ; preds = %8
br label %35
; <label>:35 ; preds = %34
%36 = load i32, i32* %y, align 4
%37 = add nsw i32 %36, 1
store i32 %37, i32* %y, align 4
br label %4
; <label>:38 ; preds = %4
ret i32 0
}
attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
読みづらいかもしれないが、おおよその流れはわかるだろう。
愚直に実行していることが読み取れる。
ここからオプティマイズをかけてみよう。
opt -S -O1 -o function.opt1.ll function.ll
まずは-O1の結果から見ていく。
; ModuleID = 'function.ll'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; Function Attrs: norecurse nounwind uwtable
define i32 @function(float* nocapture %dst, float* nocapture readonly %din, float* nocapture readonly %weight) #0 {
br label %.preheader
.preheader: ; preds = %31, %0
%indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %31 ]
%1 = mul nuw nsw i64 %indvars.iv3, 100
br label %2
; <label>:2 ; preds = %2, %.preheader
%indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next.3, %2 ]
%3 = add nuw nsw i64 %indvars.iv, %1
%4 = getelementptr inbounds float, float* %din, i64 %3
%5 = load float, float* %4, align 4
%6 = getelementptr inbounds float, float* %weight, i64 %3
%7 = load float, float* %6, align 4
%8 = fmul float %5, %7
%9 = getelementptr inbounds float, float* %dst, i64 %3
store float %8, float* %9, align 4
%indvars.iv.next = or i64 %indvars.iv, 1
%10 = add nuw nsw i64 %indvars.iv.next, %1
%11 = getelementptr inbounds float, float* %din, i64 %10
%12 = load float, float* %11, align 4
%13 = getelementptr inbounds float, float* %weight, i64 %10
%14 = load float, float* %13, align 4
%15 = fmul float %12, %14
%16 = getelementptr inbounds float, float* %dst, i64 %10
store float %15, float* %16, align 4
%indvars.iv.next.1 = or i64 %indvars.iv, 2
%17 = add nuw nsw i64 %indvars.iv.next.1, %1
%18 = getelementptr inbounds float, float* %din, i64 %17
%19 = load float, float* %18, align 4
%20 = getelementptr inbounds float, float* %weight, i64 %17
%21 = load float, float* %20, align 4
%22 = fmul float %19, %21
%23 = getelementptr inbounds float, float* %dst, i64 %17
store float %22, float* %23, align 4
%indvars.iv.next.2 = or i64 %indvars.iv, 3
%24 = add nuw nsw i64 %indvars.iv.next.2, %1
%25 = getelementptr inbounds float, float* %din, i64 %24
%26 = load float, float* %25, align 4
%27 = getelementptr inbounds float, float* %weight, i64 %24
%28 = load float, float* %27, align 4
%29 = fmul float %26, %28
%30 = getelementptr inbounds float, float* %dst, i64 %24
store float %29, float* %30, align 4
%indvars.iv.next.3 = add nsw i64 %indvars.iv, 4
%exitcond.3 = icmp eq i64 %indvars.iv.next.3, 100
br i1 %exitcond.3, label %31, label %2
; <label>:31 ; preds = %2
%indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
%exitcond5 = icmp eq i64 %indvars.iv.next4, 100
br i1 %exitcond5, label %32, label %.preheader
; <label>:32 ; preds = %31
ret i32 0
}
attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
-O1でオプティマイズをかけると4回分の演算を展開して、1つのループとして構成するようになった。
次に-O2でオプティマイズしてみよう。
opt -S -O2 -o function.opt2.ll function.ll
結果は次のようになる。
; ModuleID = 'function.ll'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; Function Attrs: norecurse nounwind uwtable
define i32 @function(float* nocapture %dst, float* nocapture readonly %din, float* nocapture readonly %weight) #0 {
br label %.preheader
.preheader: ; preds = %middle.block, %0
%indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %middle.block ]
%1 = mul i64 %indvars.iv3, 100
%scevgep = getelementptr float, float* %dst, i64 %1
%2 = add i64 %1, 99
%scevgep7 = getelementptr float, float* %dst, i64 %2
%scevgep9 = getelementptr float, float* %din, i64 %1
%scevgep11 = getelementptr float, float* %din, i64 %2
%scevgep13 = getelementptr float, float* %weight, i64 %1
%scevgep15 = getelementptr float, float* %weight, i64 %2
%3 = mul nuw nsw i64 %indvars.iv3, 100
%bound0 = icmp ule float* %scevgep, %scevgep11
%bound1 = icmp ule float* %scevgep9, %scevgep7
%found.conflict = and i1 %bound0, %bound1
%bound017 = icmp ule float* %scevgep, %scevgep15
%bound118 = icmp ule float* %scevgep13, %scevgep7
%found.conflict19 = and i1 %bound017, %bound118
%conflict.rdx = or i1 %found.conflict, %found.conflict19
br i1 %conflict.rdx, label %scalar.ph.preheader, label %vector.body.preheader
vector.body.preheader: ; preds = %.preheader
br label %vector.body
scalar.ph.preheader: ; preds = %.preheader
br label %scalar.ph
vector.body: ; preds = %vector.body.preheader, %vector.body
%index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ]
%4 = add i64 %index, %3
%5 = getelementptr inbounds float, float* %din, i64 %4
%6 = bitcast float* %5 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %6, align 4
%7 = getelementptr inbounds float, float* %weight, i64 %4
%8 = bitcast float* %7 to <4 x float>*
%wide.load22 = load <4 x float>, <4 x float>* %8, align 4
%9 = fmul <4 x float> %wide.load, %wide.load22
%10 = getelementptr inbounds float, float* %dst, i64 %4
%11 = bitcast float* %10 to <4 x float>*
store <4 x float> %9, <4 x float>* %11, align 4
%index.next = add i64 %index, 4
%12 = icmp eq i64 %index.next, 100
br i1 %12, label %middle.block.loopexit23, label %vector.body, !llvm.loop !1
scalar.ph: ; preds = %scalar.ph, %scalar.ph.preheader
%indvars.iv = phi i64 [ 0, %scalar.ph.preheader ], [ %indvars.iv.next.3, %scalar.ph ]
%13 = add nuw nsw i64 %indvars.iv, %3
%14 = getelementptr inbounds float, float* %din, i64 %13
%15 = load float, float* %14, align 4
%16 = getelementptr inbounds float, float* %weight, i64 %13
%17 = load float, float* %16, align 4
%18 = fmul float %15, %17
%19 = getelementptr inbounds float, float* %dst, i64 %13
store float %18, float* %19, align 4
%indvars.iv.next = or i64 %indvars.iv, 1
%20 = add nuw nsw i64 %indvars.iv.next, %3
%21 = getelementptr inbounds float, float* %din, i64 %20
%22 = load float, float* %21, align 4
%23 = getelementptr inbounds float, float* %weight, i64 %20
%24 = load float, float* %23, align 4
%25 = fmul float %22, %24
%26 = getelementptr inbounds float, float* %dst, i64 %20
store float %25, float* %26, align 4
%indvars.iv.next.1 = or i64 %indvars.iv, 2
%27 = add nuw nsw i64 %indvars.iv.next.1, %3
%28 = getelementptr inbounds float, float* %din, i64 %27
%29 = load float, float* %28, align 4
%30 = getelementptr inbounds float, float* %weight, i64 %27
%31 = load float, float* %30, align 4
%32 = fmul float %29, %31
%33 = getelementptr inbounds float, float* %dst, i64 %27
store float %32, float* %33, align 4
%indvars.iv.next.2 = or i64 %indvars.iv, 3
%34 = add nuw nsw i64 %indvars.iv.next.2, %3
%35 = getelementptr inbounds float, float* %din, i64 %34
%36 = load float, float* %35, align 4
%37 = getelementptr inbounds float, float* %weight, i64 %34
%38 = load float, float* %37, align 4
%39 = fmul float %36, %38
%40 = getelementptr inbounds float, float* %dst, i64 %34
store float %39, float* %40, align 4
%indvars.iv.next.3 = add nsw i64 %indvars.iv, 4
%exitcond.3 = icmp eq i64 %indvars.iv.next.3, 100
br i1 %exitcond.3, label %middle.block.loopexit, label %scalar.ph, !llvm.loop !4
middle.block.loopexit: ; preds = %scalar.ph
br label %middle.block
middle.block.loopexit23: ; preds = %vector.body
br label %middle.block
middle.block: ; preds = %middle.block.loopexit23, %middle.block.loopexit
%indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
%exitcond5 = icmp eq i64 %indvars.iv.next4, 100
br i1 %exitcond5, label %41, label %.preheader
; <label>:41 ; preds = %middle.block
ret i32 0
}
attributes #0 = { norecurse nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 1}
!3 = !{!"llvm.loop.interleave.count", i32 1}
!4 = distinct !{!4, !2, !3}
-O2のオプティマイズでは.preheader:が元のソースコードのyのforループ部分である。 そして、注目すべき点は、xのループがvector.body:とscalar.ph:ラベルの2つのループで構成されたことである。 ラベルの通り、ベクター処理とスカラー処理である。
vector.body:とscalar.ph:の分岐条件を確認してみると、dstの書き込み範囲がdinまたはweightの読み込み範囲と重なるかどうか(ポインタのエイリアスの有無)を実行時に判定しており、重なる場合はscalar.ph:、重ならない場合はvector.body:に分岐する。
次に-O3でオプティマイズしてみよう。
opt -S -O3 -o function.opt3.ll function.ll
-O3の結果は-O2と同じであった。
元のソースコード自体が複雑ではないので結果がおなじになってしまったのだと思われる。
なお、optでのターゲットアーキテクチャーはx86-64になっている。これを他のアーキテクチャーに変更すると、ベクタライズされなくなった。