A parallel 'for' loop memory template for a high level synthesis compiler
假設大家對 Bus 已有初步的概念,底下我們就來 hack LLVM 的 IR,把它轉成我們所需要的 IR 格式。可參考之前的 post「Parallel IR gen」:藉由改變 Instruction 的 link 來表示每個 Instruction 所在的 cycle 位置。除此之外,這邊我們還可以把 Bus 的 cofactor 加進來。
sample code.
View more presentations from Craig Moore
#define ADD_RSHT(a,b) (a+b)>>1 void PP(unsigned int *a, unsigned int *b){ unsigned int r; for(r=0; r<1024; r=r+4){ b[r] = ADD_RSHT(a[r] ,a[r+1]); b[r+1] = ADD_RSHT(a[r+1],a[r+2]); b[r+2] = ADD_RSHT(a[r+2],a[r+3]); b[r+3] = ADD_RSHT(a[r+3],a[r+4]); } } 假設 PP module 中 support 4 個 ADD_RSHT,且 a[]、b[] 為外部的 Memory,結構大致如下所示。透過 LLVM 轉出 IR byte-code:
%arrayidx = getelementptr i32* %a, i32 %tmp ... %arrayidx6 = getelementptr i32* %a, i32 %tmp ... %tmp6872 = or i32 %tmp, 3 %arrayidx36 = getelementptr i32* %a, i32 %tmp6872 %arrayidx59 = getelementptr i32* %b, i32 %tmp6872 %tmp69 = add i32 %tmp, 4 %arrayidx52 = getelementptr i32* %a, i32 %tmp69 %tmp3 = load i32* %arrayidx, align 4, !tbaa !0 %tmp7 = load i32* %arrayidx6, align 4, !tbaa !0 ... 可以發現 load/store 的 value index 為 arrayidx(x)。這邊我們就把這樣的 case 視為 module PP 透過 bus 來 access external memory。如果把 a[r]、a[r+1]... 視為連續的 Memory address,是不是就可以用 bus 的 Burst mode 來簡化? ex:
a[r] @ address 0x00000000 a[r+1] => @ burst 4 a[r+2] @ memory read a[r+3] @ decoder/encoder... 有了上面的概念後,底下就來 hack LLVM 的 IR,產生我們所需要的 Bus IR。除此之外,還可以加入一些分析 model 進來,做到 co-analysis 的動作。 Results:
External.Mem.Load: ; preds = %for.body %External_Mem_op15 = shl i32 %indvar, 2 %External_Mem_arr14 = getelementptr i32* %a, i32 %External_Mem_op15 %External_Mem_op13 = or i32 %External_Mem_op15, 1 %External_Mem_arr12 = getelementptr i32* %a, i32 %External_Mem_op13 %External_Mem_op11 = or i32 %External_Mem_op15, 2 %External_Mem_arr10 = getelementptr i32* %a, i32 %External_Mem_op11 ... %External_Mem_ld7 = load i32* %External_Mem_arr14 %External_Mem_ld6 = load i32* %External_Mem_arr12 %External_Mem_ld5 = load i32* %External_Mem_arr12 ... br label %Exe Exe: ; preds = %External.Mem.Load %Exe_op26 = add i32 %External_Mem_ld6, %External_Mem_ld7 %Exe_op25 = lshr i32 %Exe_op26, 1 %Exe_op24 = add i32 %External_Mem_ld4, %External_Mem_ld5 %Exe_op23 = lshr i32 %Exe_op24, 1 %Exe_op22 = add i32 %External_Mem_ld2, %External_Mem_ld3 %Exe_op21 = lshr i32 %Exe_op22, 1 %Exe_op20 = add i32 %External_Mem_ld, %External_Mem_ld1 %Exe_op = lshr i32 %Exe_op20, 1 br label %External.Mem.Store External.Mem.Store: ; preds = %Exe %External_Mem_arr19 = getelementptr i32* %b, i32 %External_Mem_op15 %External_Mem_arr18 = getelementptr i32* %b, i32 %External_Mem_op13 %External_Mem_arr17 = getelementptr i32* %b, i32 %External_Mem_op11 Project download: https://sites.google.com/site/funningboy/c/llvm_bus.tar.gz?attredirects=0&d=1 ps: 可以參考 learning plus: C to Verilog notes 中的 reduce map 方式,把相同的 Instruction 做 reduce。 ex: Old
%External_Mem_ld6 = load i32* %External_Mem_arr12 %External_Mem_ld5 = load i32* %External_Mem_arr12 ex: New
%External_Mem_ld6 = load i32* %External_Mem_arr12 ...
沒有留言:
張貼留言