learning plus: 3月 2011

2011年3月30日星期三

LLVM oIR code gen

在現今的 CPU 架構中都有 support SIMD(single instruction multi data) 的概念,來做到hardware 加速.底下小弟就舉個最簡單的範例吧. 假設底下 ex(1) 為原始的 hardware code. ex(1)

void SCALAR(unsigned int a, unsigned int b, unsigned int *c){

    unsigned d = (a+b)>>1;
             d = d+2;
            *c = d;
}

ex(2) 為 clang compile 後的 LLVM IR. 在ex(2)中,我們發現 hardware 要先做 "add",在繼續做 "lshr" 的動作.所以要消耗兩個 cycle 的時間.今天假設我們的硬體support "add+lshr" 的指令.是不是就只需要一個 cycle 的時間就可完成.如ex(3)所示.這樣我們就可以減少 hardware Instruction counts.相對的效能就可以提升.有點類似 CUDA/ARM 裡面的特殊指令集拉... ex(2)

define void @SCALAR(i32 %a, i32 %b, i32* nocapture %c) nounwind {
entry:
 %add = add i32 %b, %a
 %shr = lshr i32 %add, 1
 %add3 = add i32 %shr, 2
 store i32 %add3, i32* %c, align 4, !tbaa !0
 ret void
}

ex(3)

define void @SCALAR(i32 %a, i32 %b, i32* nocapture %c) nounwind {
entry:
 %shr = add_lshr i32 %b, %a, 1
 %add3 = add i32 %shr, 2
 store i32 %add3, i32* %c, align 4, !tbaa !0
 ret void
}

sample code @ llvm


#define DEBUG_TYPE "oIRPass"

#include "llvm/Pass.h"
#include "llvm/Module.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/Dominators.h"

#include <iostream>
#include <string>
#include <sstream>
#include <vector>
#include <set>
#include <map>

using namespace llvm;
using namespace std;

 
  class oIRPass : public FunctionPass {

        public:

            static char ID;

            oIRPass() : FunctionPass(ID) {}

           ~oIRPass(){ 
                       } 

            virtual void getAnalysisUsage(AnalysisUsage &AU) const {
                  AU.setPreservesCFG();
            }

            bool runOnFunction(Function &F);

            void setInstAddLshrSucc(Instruction *Inst);

            void chkInstAddLshrSucc();

            std::string getInstAddLshrToString(Instruction *iPreInst,Instruction *iCutInst);

            std::string getResultToString(Instruction *Inst);

       private:
           std::vector<Instruction*> oIRInstSuccVec;
           std::map<Instruction*,std::string> oIRInstMap;
   };


  // get <Result>
  std::string oIRPass::getResultToString(Instruction *Inst){

  Instruction *IT = dyn_cast<Instruction>(Inst);
  assert( IT!=NULL && "getResultToString Error");
 
  bool pass = false;

  // @ successor Instruction::xx check
  for (Instruction::use_iterator uit = IT->use_begin(); uit!= IT->use_end(); ++uit) {
       Instruction *NIT = dyn_cast<Instruction>(*uit);
       assert( NIT!=NULL && "getResultToString Error");    

       for(unsigned i=0; i<NIT->getNumOperands(); ++i){
           Value *val = NIT->getOperand(i);

         if(dyn_cast<Instruction>(val)){
          Instruction *PIT = dyn_cast<Instruction>(val);

          if( PIT==IT ){
          pass = true;
          return val->getNameStr();
          }
       }

      }
    }

   assert( pass==true && "getResultToString Error");
  }





  // Instruction 2 string
  // ex: %tmp22 = addlshr i32 %i,%j, 1 <<---
  std::string oIRPass::getInstAddLshrToString(Instruction *iPreInst,Instruction *iCutInst){

  Instruction *PreInst = dyn_cast<Instruction>(iPreInst);
  Instruction *CutInst = dyn_cast<Instruction>(iCutInst);

  BinaryOperator *prebin = dyn_cast<BinaryOperator>(PreInst);
  assert( PreInst!=NULL && prebin!=NULL && prebin->getOpcode()== Instruction::Add && "getInstAddLshrToString Error");

  BinaryOperator *cutbin = dyn_cast<BinaryOperator>(CutInst);
  assert( CutInst!=NULL && cutbin!=NULL && cutbin->getOpcode()== Instruction::LShr && "getInstAddLshrToString Error");

  // @ const cehck
  Value *val2 = cutbin->getOperand(1);
  ConstantInt *c2 = dyn_cast<ConstantInt>(val2);
  assert( c2!=NULL && "getInstAddLshrToString Error");
  
  Value *val0 = prebin->getOperand(0);
  Value *val1 = prebin->getOperand(1);

  unsigned int prebinWidth = cast<IntegerType>(prebin->getType())->getBitWidth();
  unsigned int cutbinWidth = cast<IntegerType>(cutbin->getType())->getBitWidth();
 
  // @ width check
  assert( cutbinWidth<=prebinWidth && prebinWidth==32 && "getInstAddLshrToString Error");

  stringstream sb;
  sb << c2->getZExtValue();
  std::string st = "%"+getResultToString(CutInst)+" = addlshr i32 "+"%"+val0->getNameStr()+", "+"%"+val1->getNameStr()+", "+ sb.str();
 
  return st;
 }







  // set Instruction successor match Add->LSHR?
  void oIRPass::setInstAddLshrSucc(Instruction *Inst){

  Instruction *IT = dyn_cast<Instruction>(Inst);
  assert( IT!=NULL && "setInstructionSucc Error");

  oIRInstSuccVec.clear();

  if(dyn_cast<BinaryOperator>(IT)){
     BinaryOperator *bin = dyn_cast<BinaryOperator>(IT);

     // @ Instruction::A 
     if(bin->getOpcode() == Instruction::Add){
     oIRInstSuccVec.push_back(IT);

     // @ successor Instruction::Add check
     for (Instruction::use_iterator uit = IT->use_begin(); uit!= IT->use_end(); ++uit) {
          Instruction *NIT = dyn_cast<Instruction>(*uit);
  
         if(dyn_cast<BinaryOperator>(NIT)){
             BinaryOperator *nbin = dyn_cast<BinaryOperator>(NIT);
         
             if(nbin->getOpcode() == Instruction::LShr)
             oIRInstSuccVec.push_back(NIT);

          }    
       }
     }

   }
  }







 // check Instruction successor match Add->LShr?
 void oIRPass::chkInstAddLshrSucc(){

      typedef std::vector<Instruction*>::iterator IT_iterator;

      // @ Instruction::Add token
      IT_iterator ITB = oIRInstSuccVec.begin();

      Instruction    *PreIT  = dyn_cast<Instruction>(*ITB);
      BinaryOperator *prebin = dyn_cast<BinaryOperator>(PreIT);
      assert( PreIT!=NULL && prebin!=NULL && prebin->getOpcode()== Instruction::Add && "chkInstructionSucc Error");
   
     // @ Instruction::LShr token 
  for(IT_iterator IB = oIRInstSuccVec.begin(), IE = oIRInstSuccVec.end(); IB != IE; ++IB) { 

      Instruction    *CutIT  = dyn_cast<Instruction>(*IB);
      BinaryOperator *cutbin = dyn_cast<BinaryOperator>(CutIT);
      assert( CutIT!=NULL && cutbin!=NULL && "chkInstructionSucc Error");

      Value *val1 = cutbin->getOperand(1);

      if(cutbin->getOpcode()== Instruction::LShr && dyn_cast<ConstantInt>(val1))
      errs() << "oIR found :: " << getInstAddLshrToString(PreIT, CutIT) << "\n";
                            

    } //end for
 }






  bool oIRPass::runOnFunction(Function &F) {
 
        for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB) {
                   BasicBlock  *BC = dyn_cast<BasicBlock>(BB);

             for (BasicBlock::iterator IB = BB->begin(), IE = BB->end(); IB != IE; ++IB) {  
                          Instruction *IT = dyn_cast<Instruction>(IB);

                      setInstAddLshrSucc(IT);
                      chkInstAddLshrSucc(); 
          } 
        } 

       return true;
    }

    char oIRPass::ID = 0;
    RegisterPass<oIRPass> PXX("oIR_pass", "oIR lib test",false,false);

Results: oIR found :: %shr = addlshr i32 %b, %a, 1 refs: LLVM + BOOST = LMBOOST.... Sampe Pass @ LLVM LLVM 2.8 env set && pass manager set

2011年3月27日星期日

CUDA + LLVM = GpuOcelot

小弟最近在跟 GpuOcelot 的 project.底下先就 "build" 跟 "graph" 的一些觀念介紹. Build Part how to install GpuOcelot? Step1. external lib build 可參考 svn 的方式.http://code.google.com/p/gpuocelot/wiki/Installation,把所需要的lib 依序 build 起來. Refs: LLVM 2.8 env set && pass manager set Boost 1.4.X ps: default prefix 路徑在 /usr/local/bin, /usr/local/lib 下, 且記得加入到 $PATH 下.

% export PATH=$PATH:/usr/local/bin:/usr/local/lib
% $PATH

Step2. hack source code. 2-1. boost env command 在 1.4 版之後 -lboost_filesystem-mt 改成 -lboost_filesystem.所以要改 ./scripts/build_environment.py 內 boost set.

//@ original
env.Replace(EXTRA_LIBS=['-lboost_system-mt',
                     '-lboost_filesystem-mt', \
                     '-lboost_thread-mt', '-lGLEW'])

//@ new
env.Replace(EXTRA_LIBS=['-lboost_system',
                     '-lboost_filesystem', \
                     '-lboost_thread', '-lGLEW'])

2-2. string 2 char* ps : 好像在之前的版本沒有這個問題說...

// @ original
//      std::ofstream file(_path + "prof." + _kernelName + "_controlFlow.dot");

// @ new
std::string path =  _path + "prof." + _kernelName + "_controlFlow.dot";
std::ofstream file(path.c_str());

2-3. enum of class 在./gpuocelot/ocelot/ocelot/ir/interface/PTXOperand.h 中定義了 PTXOperand 的 Layer 如ex(1). 所以要存取 Register時, 應該是 ir::PTXOperand::Register, 而不是 ir::PTXOperand::AddressMode::Register,如果硬要把 AddressMode 加入的話.應該要改成 ex(2), 不過改成 ex(2) 後跟 design sink 不起來.所以還是把 AddressMode 拿掉如 ex(3). ex(1)

// @ original
namespace ir {
...
class PTXOperand {
      enum AddressMode {
                         Register,
      }
...
}

ex(2)

// @ new
namespace ir {
...
class PTXOperand {
   class AddressMode {
      enum AddressMode {
                         Register,
      }
   }
...

ex(3)

// @ original                         
if (ptxInstruction->a.addressMode == ir::PTXOperand::AddressMode::Special) {
                        
// @ new
if (ptxInstruction->a.addressMode == ir::PTXOperand::Special) {

2-4 lib include

#include "boost/array.hpp"
...

Step3. run & build test

//@ build a& install
% ./build.py -i

//@ build test all
% ./build.py -t full

cd .release_build/

Graph theories Part 可以先從 "Advanced Compiler Design and Implementation" by Steven Muchnick 這本書看起. Ocelot 裡面有很多 Analysis 的方式是來自這本書的...當然小弟也是在進修中..XD Dominator (graph theory) SSA Graph Tree traversal

2011年3月25日星期五

C to Verilog notes

底下列出目前在 C to Verilog 所看到的 features. @ Pre-Synthesis part 1. Parallel support Ref : parallel synthesis @ llvm 2. reduce redundant nodes

ex: original
a = 3+4;        
b = b+a;

ex: new
// cut instruction "a = 3+4" & move the new point to next instruction
b = b+7

3. reduce redundant bit width

ex: original
int32_t a,c;
a =3;
c = 32'b(a)+32'b(1);

ex: new
int8_t a;
int9_t c;
a =3;
c = 8'b(a)+8'b(1);

4. remove Allocation

ex: original

void A(){
...
B (a,b,&c);
}

void B(int a,int b, int *c){
*c = a+b;
}

ex: new
void A(){
...
c = a+b;
}

5. instruction priority

c=a+b; instruction 1
d=c+1; instruction 2
priority 1 > 2
...

6. PHI node ignore. @ constrain at each Basic Block the data had already done.

ps: @ preprocessor Basic Block the PHI in-alive Nodes had already done.
ex: 
%i.08 = phi i32 [ 0, %entry ], [ %inc, %popCnt.exit ]
%arrayidx = getelementptr i32* %B, i32 %i.08
...

//0    @ Basic block entry
//%inc @ Basic block popCnt.exit & replace it be %i.08

7. reduce redundant instructions

@ PHI part
%i.06.i = phi i32 [ 0, %for.body ], [ %inc.i, %for.body.i ]
%i.07.i = phi i32 [ 0, %for.body ], [ %inc.i, %for.body.i ]

// cut %i.07.i instruction

@ Operand   
%add.01.i = add i32 %and.i, %sum.07.i
%add.02.i = add i32 %and.i, %sum.07.i

// cut %add.02.i instruction

8. loop with hardware resource constrain "2" Add.

ex : original
for(int i=0; i<10; i++)
   a[i] = b[i]+1;

...

ex : new
for(int i=0; i<10; i=i+2){
    a[i]   = b[i]+1;
    a[i+1] = b[i+1]+1;
}

...

@ Synthesis part 1. Global Value @ In Verilog Module not support the local value

ex : @ Verilog
 
module xx(...);
input ...
output ...
reg ...
wire ...

...

always@(posedge clk)begin
...
end

...
endmodule

2. Schedule list external memory support. split the memory load instruction into two Basic blocks @ Memory request(address/mode)block , Memory (wait the require data)read block. ps: each Basic Block should be done at one cycle clock.

ex: original
for.body:
// @ address/mode phase
%arrayidx = getelementptr i32* %B, i32 %i.08

// @ data phase
%tmp3 = load i32* %arrayidx, align 4, !tbaa !0
...

ex: new

for.body.phase1:
%arrayidx = getelementptr i32* %B, i32 %i.08
br label %for.body.phase2

for,body.phase2:
%tmp3 = load i32* %arrayidx, align 4, !tbaa !0

@ Verilog view

module(...)
output [31:0] mem_address;
output [0 :0] mem_mode;
output [31:0] mem_store_data;
input  [31:0] mem_load_data;

reg [3 :0] cur_state,nxt_state;
reg [31:0] a;
....

case(cur_state)
for_body_phase1 : begin 
                        mem_address = 0x00000000; 
                        mem_mode    = read;
  
                        nxt_state = for_body_phase2; 
                  end

for_body_phase2 : begin 
                        a = mem_load_data;
                        
                        nxt_state = alu_phase;
                  end

alu_phase       : 
...

...
endcase
...

cur_state = nxt_state;

3. pipeline support unroll the loop & pipeline insert. @ example case

ex: original

%cat test.c

static inline unsigned int popCnt(unsigned int input) { 
    unsigned int sum = 0; 
    for (int i = 0; i < 32; i++) {
        sum += (input) & 1; 
        input = input/2; 
    } 
    return sum; 
}

how to unroll loop ?

// step1. gen IR bytecode
%clang -O3 -emit-llvm test.c -e -o test.bc

// step2. unroll the loop in bytecode
// you can check the opt by "opt -help | grep loop"
%opt -loop-unroll -unroll-count 20 test.bc -debug | llvm-dis

// step3. if step2 pass, then output new bytecode
%opt -loop-unroll -unroll-count 20 test.bc > opt_test.bc

un-pipeline IR & view


%llvm-dis opt_test.bc > opt_test.ll

%cat opt_test.ll

for.body.i.1: ...Instruction A   br label %for.body.i.2
for.body.i.2: ...Instruction B   br label %for.body.i.3
for.body.i.3: ...Instruction C   br label %for.body.i.4
for.body.i.4: ...Instruction D   br label %for.body.i.5
...

pipeline it

for.body.i.1 : ...Instruction A         br label %for.body.i.2
for.body.i.2 : ...Instruction B,A       br label %for.body.i.3
for.body.i.3 : ...Instruction C,B,A     br label %for.body.i.4
for.body.i.4 : ...Instruction D,C,B,A   br label %for.body.i.5
for.body.i.6 : ...Instruction   D,C,B   br label %for.body.i.7
for.body.i.8 : ...Instruction     D,C   br label %for.body.i.9
for.body.i.10: ...Instruction       D   br label %for.exit.i.0

for.exit.i.0 :
...

2011年3月24日星期四

LLVM + BOOST = LMBOOST....

底下就把 LLVM + Boost c++ 做簡單的結合. 有 Lib 就是個 "爽"


#define DEBUG_TYPE "BoostPass"

#include "llvm/Pass.h"
#include "llvm/Module.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/Dominators.h"

#include <boost/graph/graph_traits.hpp>
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/dijkstra_shortest_paths.hpp>


#include <iostream>
#include <string>
#include <sstream>
#include <vector>
#include <set>
#include <map>

using namespace llvm;
using namespace boost;
using namespace std;

 
    template<class T>
    class BoostEdge {
     
       public:
         BoostEdge(T iEdgeFm,
                   T iEdgeTo) : InstructionEdgeFm(iEdgeFm),
                                InstructionEdgeTo(iEdgeTo) {}
        ~BoostEdge(){} 

         T GetInstructionEdgeFm(){ return InstructionEdgeFm; }
         T GetInstructionEdgeTo(){ return InstructionEdgeTo; }

       private:
        T InstructionEdgeFm;
        T InstructionEdgeTo;
    };

    class BoostPass : public FunctionPass {

        public:

            static char ID;

            BoostPass() : FunctionPass(ID) { LLVM2BoostInx=0; }

           ~BoostPass(){ 
                         BoostEdgeList.clear();
                         BoostNodeList.clear();
                         LLVM2BoostMapList.clear();
                         Boost2LLVMMapList.clear(); 
                       } 

            virtual void getAnalysisUsage(AnalysisUsage &AU) const {
                  AU.setPreservesCFG();
            }

            bool runOnFunction(Function &F);

            void setInstructionLLVM2Boost(Instruction *inst);

            int  getIndexLLVM2Boost(Instruction *inst);


       private:
          std::vector<BoostEdge<int>*>  BoostEdgeList;          
          std::vector<Instruction*>     BoostNodeList;

          std::map<Instruction*,int>    LLVM2BoostMapList;
          std::map<int,Instruction*>    Boost2LLVMMapList;

          int LLVM2BoostInx;
    };


  void  BoostPass::setInstructionLLVM2Boost(Instruction *inst){

        if(std::find(BoostNodeList.begin(),BoostNodeList.end(), inst) == BoostNodeList.end()){
           BoostNodeList.push_back(inst);
           LLVM2BoostMapList[inst] = LLVM2BoostInx++; 
        } 
  }

  int  BoostPass::getIndexLLVM2Boost(Instruction *inst){

       std::map<Instruction*,int>::iterator it = LLVM2BoostMapList.find(inst);

       if(it!=LLVM2BoostMapList.end())
       return it->second;

       return -1;
  }

  bool BoostPass::runOnFunction(Function &F) {
 
        for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB) {
                   BasicBlock  *BC = dyn_cast<BasicBlock>(BB);

             for (BasicBlock::iterator IB = BB->begin(), IE = BB->end(); IB != IE; ++IB) {  
                          Instruction *IT = dyn_cast<Instruction>(IB);

                  // @ Instruction::Add , Sub ...
                  if(dyn_cast<BinaryOperator>(IT)){

                    for (Instruction::use_iterator uit = IT->use_begin(); uit!= IT->use_end(); ++uit) {

                        if(dyn_cast<BinaryOperator>(*uit)){
                           Instruction *NT = dyn_cast<Instruction>(*uit);

                           //@ same basic block 
                           if( NT->getParent()==BC ){
                               errs() << "LLVM->From Instruction :: " << *IT << "\n";
                               errs() << "LLVM->To   Instruction :: " << *NT << "\n";
                               errs() << "\n";

                               setInstructionLLVM2Boost(IT);
                               setInstructionLLVM2Boost(NT);

                               int it = getIndexLLVM2Boost(IT); 
                               int nt = getIndexLLVM2Boost(NT);
                               
                               assert( it!=-1 && nt!=-1 && "LLVM2Boost Error<1>");                               

                               BoostEdge<int> *BtEdgePtr = new BoostEdge<int>(it,nt);
                               BoostEdgeList.push_back(BtEdgePtr); 
                          }
                        }
                    }
                 }
          
          } //end Instruction iterator

          // @ LLVM 2 Boost Graph
          const unsigned int NodeSize = BoostNodeList.size()+1;  
          const unsigned int EdgeSize = BoostEdgeList.size();

          if( NodeSize>1 && EdgeSize>0 ){

          typedef adjacency_list<vecS, vecS, bidirectionalS> Graph;

          typedef std::pair<int, int> Edge;

          Graph g(NodeSize);

          for(std::vector<BoostEdge<int>*>::iterator IE = BoostEdgeList.begin(); IE != BoostEdgeList.end(); ++IE)
              boost::add_edge((*IE)->GetInstructionEdgeFm(), (*IE)->GetInstructionEdgeTo(), g);

          
          // get the property map for vertex indices
          typedef property_map<Graph, vertex_index_t>::type IndexMap;
          IndexMap index = get(vertex_index, g);

          std::cout << "vertices(g) = ";
          typedef graph_traits<Graph>::vertex_iterator vertex_iter;
          std::pair<vertex_iter, vertex_iter> vp;
          for (vp = vertices(g); vp.first != vp.second; ++vp.first)
            std::cout << index[*vp.first] <<  " ";
            std::cout << std::endl;

          // @ Boost Graph Algorithm
          // ....


          // @ Boost Graph Result 2 LLVM 
          // ...
 
         }

        BoostNodeList.clear();
        BoostEdgeList.clear();

        LLVM2BoostMapList.clear();
        Boost2LLVMMapList.clear();

        LLVM2BoostInx =0;
   
        } // end basic block iterator

       return true;
    }

    char BoostPass::ID = 0;
    RegisterPass<BoostPass> PXX("Boost_pass", "Boost c++ lib test",false,false);

compile

opt -load Debug+Asserts/lib/Boost_t.so  -Boost_pass test/scalar.bc

ps: 後來發現可以用 std::pair 來取代 Edge class..XD Table of Contents: the Boost Graph Library

2011年3月22日星期二

Sample Pass @ LLVM

寫了個簡單的 sample @ LLVM, 就當練習練習...XD. 不過沒有考慮 function 的正確性.

#define DEBUG_TYPE "SamplePass"

#include "llvm/Pass.h"
#include "llvm/Module.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/Dominators.h"

#include <iostream>
#include <string>
#include <sstream>
#include <vector>
#include <set>
#include <map>

using namespace llvm;

using std::set;
using std::map;
using std::vector;
using std::pair;
using std::string;

    class SamplePass : public FunctionPass {

        public:

            static char ID;

            SamplePass() : FunctionPass(ID) {}

            virtual void getAnalysisUsage(AnalysisUsage &AU) const {

                  AU.setPreservesCFG();
            }

            bool runOnFunction(Function &F);

    };


  bool SamplePass::runOnFunction(Function &F) {
 
        for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB) {
                   BasicBlock  *BC = dyn_cast<BasicBlock>(BB);

             std::vector<Instruction*> RemoveList;
                                       RemoveList.clear();
  
             for (BasicBlock::iterator IB = BB->begin(), IE = BB->end(); IB != IE; ++IB) {  
                          Instruction *IT = dyn_cast<Instruction>(IB);

                // @ get Array Load instruction 
                 if (dyn_cast<GetElementPtrInst>(IB)) {

                    for (Instruction::use_iterator uit = IT->use_begin(); uit!= IT->use_end(); ++uit) {
                        if(dyn_cast<LoadInst>(*uit)){
                           
                           Instruction *NT = dyn_cast<Instruction>(*uit);
                         
                           errs() << "getelementptr ::" << *IT << "\n";
                           errs() << "load          ::" << *NT << "\n";
                           errs() << "\n";
                        }
                    }
                 }

              // @ get ALL PHI values && same basic block check        
                if(dyn_cast<PHINode>(IT)){
                   PHINode *phi = dyn_cast<PHINode>(IT);

                   unsigned incs = phi->getNumIncomingValues(); 
                   for(unsigned i =0; i<incs; i++){ 

                      BasicBlock  *INB = phi->getIncomingBlock(i);
                      Value       *val = phi->getIncomingValue(i);

                      if( INB==BC ){
                         errs() << "Instruct  ::" << *IT << "\n";
                         errs() << "PHI value ::" << *val << "\n";
                         errs() << "\n";
                      }  
                   }
                }

              // @ get Binary Operator Instruction::Add
              if(dyn_cast<BinaryOperator>(IT)){
                 BinaryOperator* bin = dyn_cast<BinaryOperator>(IT);

                if (bin->getOpcode() == Instruction::Add) {
                    Value *val0 = IT->getOperand(0);
                    Value *val1 = IT->getOperand(1);

                    errs() << "Add Value0 ::" << *val0 << "\n";
                    errs() << "Add Value1 ::" << *val1 << "\n"; 
                }
              }


            // @ insert Sub Instruction after Add Instruction
            if(dyn_cast<BinaryOperator>(IT)){ 
               BinaryOperator* bin = dyn_cast<BinaryOperator>(IT);

               if (bin->getOpcode() == Instruction::Add){
                   Value *val0 = IT->getOperand(0);
                   Value *val1 = IT->getOperand(1);

                   BinaryOperator::Create(Instruction::Sub, val0, val1, "SUB", IT);
      
               }
            }

            // @ remove ALL Instruction::Add && link the Value0 at next connect
            if(dyn_cast<BinaryOperator>(IT)){
               BinaryOperator* bin = dyn_cast<BinaryOperator>(IT);

                if (bin->getOpcode() == Instruction::Add) {

                       Value *val0 = IT->getOperand(0);

                       IT->replaceAllUsesWith(val0);
                       RemoveList.push_back(IT);
             }
           }

           
          } //end Instruction iterator
  
         // @ remove ALL Instruction::Add 
         for(std::vector<Instruction*>::iterator rmIt  = RemoveList.begin();
                                                 rmIt != RemoveList.end();   ++rmIt){

                   Instruction *It = dyn_cast<Instruction>(*rmIt);
                                It->eraseFromParent();
          }

          RemoveList.clear();


        } // end basic block iterator

        return true;
    }

    char SamplePass::ID = 0;
    RegisterPass<SamplePass> PXX("Sample_pass", "extract Sample DFG",false,false);

compile

opt -load Debug+Asserts/lib/Parallel_t.so  -Sample_pass test/opt_scalar.bc | llvm-dis | grep sub

2011年3月20日星期日

parallel synthesis @ llvm

我們在machine code 上看到的指令是cycle by cycle 的形式,但實際在硬體上卻可以透過多個 hardware resource 做到 parallel 的方式.如在 Verilog 的語法上用 '(' ')' 來 assign hardware. 底下就用 C 2 Verilog 內的例子來講解 ps: 不管 RHS, LHS 的相關性.這邊只考慮同 alu type 的 instruction. 且至少連續3個 ALU 以上的 instruction. 假設一個sample 的 hardware IP named test. ex : sample c code

void test(int a,int b, int c, int d, int e, int *o){
     *o = a+b+c+d+e;
}

透過 GCC compile 之後的 machine code 可以看出是 cycle by cycle 的型態.

test:
        movl    8(%esp), %eax
        addl    4(%esp), %eax
        addl    12(%esp), %eax
        addl    16(%esp), %eax
        addl    20(%esp), %eax
        movl    24(%esp), %ecx
        movl    %eax, (%ecx)
        ret

在 llvm IR 中也是如此.

define void @test(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32* nocapture %o) nounwind {
entry:
  %add = add i32 %b, %a
  %add3 = add i32 %add, %c
  %add5 = add i32 %add3, %d
  %add7 = add i32 %add5, %e
  store i32 %add7, i32* %o, align 4, !tbaa !0
  ret void
}

轉成 Graph 來看.大致是如此

但是從 hardware 的角度來看,不管 hardware resource constrains 可以長成這樣子

這邊我們只要用到3個clock cycle 跟兩個 ALU IP ADDs 就可以完成.有了以上的觀念後. 其實最後目標就是改變 llvm 的 IR 型態來符合我們的假設. 改變後的 IR, 可發現出 parallel 的方式.

define void @test(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32* nocapture %o) nounwind {
entry:
  %lowlevel = add i32 %b, %a
  %lowlevel1 = add i32 %c, %d
  %lowlevelOdd = add i32 %e, %lowlevel1
  %headNode = add i32 %lowlevel, %lowlevelOdd
  store i32 %headNode, i32* %o, align 4, !tbaa !0
  ret void
}

ps: 其實在 verilog 中,可以用 (a+b)+(c+d)+e 的方式來做出 parallel 的合成.

2011年3月16日星期三

DFG @ c++

如果你對 llvm 很熟的話. 那麼這篇你看了應該會吐血....因為....小弟沒把 Reference manual 看熟,就自己硬幹底層的 class 導致跟 frond end 的 llvm interface 接起來有點醜. 感覺是小弟自己重複定義了 llvm 已有的 class. 如 llvm 中的 Instruction 可用 dyn_cast 的方式轉成 binary operator 跟 Value 的型態. 而我們定義了 Node, Edge 來描述 Instruction 的結構.... 類似的情況. 而 llvm 可以藉由自己定義的 library 來改變 IR. 而我們是改變 Graph 的 type... 反正就是一個字來形容 "菜" .XD 主要的問題就是在於 llvm 是用 Transport Triggered Architecture 的方式來 link dependent instruction. 就 (tmp3+tmp1)>>1 的結果而言. llvm 會把 tmp3+tmp1 的結果(%add) 的 pointer 指向 next dependent 的 instruction. 且 (%add) 的 result 沒有 API 可以 Get. 雖然可以用 iterator 從最後一個 Instruction 把每個 Instruction 的關係找出來. 但這樣好像有點暴力...

%add = add nsw i32 %tmp3, %tmp1
%shr = ashr i32 %add, 1

還好有提早發現到這個問題,不然再繼續走下去.會更加吐血吧... project: https://github.com/funningboy/XVerilog/blob/master/main.cpp 目前完成 Node : 定義每個 hardware resource ex: reg, alu, mem .... Edge : 定義每個 hardware resource 彼此的相依關係 ex: prelist, nxtlist... Graph : 定義個Basic Blcok內的 Node, Edge. cycle check : 針對每個 instruction 的 Delay 來判斷是否要插入 reg schedule : ASAP/ALAP refs: TCE project: Co-design of application-specific processors with LLVM-based compilation support How do I get the result of an instruction?