learning plus: 8月 2010

2010年8月30日星期一

ARM Cortex Ax with NEON SIMD

ARM 除了 support Thumb SIMD之外,另一個針對大量Data傳輸所建立的指令集 NEON,可參考底下的 link 有詳細的說明. 其實說穿了就是 Hardware Parallel support.利用多個 operator 來同步執行..

Lect.10.arm soc.4 neon

View more presentations from sean chen.

底下就用 NEON 的概念來寫Parallel support,不過 compiler/Hardware 並沒有支援.所以還是只有自 High 而已...XD


#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

#define BUF_SIZE 4

/* One converted sample: three 8-bit output channels (A, B, C). */
struct ABC {
 uint8_t A; 
 uint8_t B;
 uint8_t C;
 
} ABC_BUF[BUF_SIZE]; /* global output buffer filled by the *_XYZ2ABC routines */

/* One input sample: three 8-bit input channels (X, Y, Z). */
struct XYZ {
 uint8_t X;
 uint8_t Y;
 uint8_t Z;
} XYZ_BUF[BUF_SIZE]= {  /* global input buffer with BUF_SIZE fixed test vectors */
 {       0,      0,      0},
 {      25,     25,     25},
 {       3,      5,      9},
 {       7,      5,      3},
};

void C_XYZ2ABC(){
int i;
     for(i=0; i<BUF_SIZE; i++){
 ABC_BUF[i].A = XYZ_BUF[i].X + XYZ_BUF[i].Y + XYZ_BUF[i].Z;
 ABC_BUF[i].B = XYZ_BUF[i].X + XYZ_BUF[i].Y - XYZ_BUF[i].Z;
 ABC_BUF[i].C = XYZ_BUF[i].X - XYZ_BUF[i].Y - XYZ_BUF[i].Z;
     }
}

/*
 * Same conversion as C_XYZ2ABC, written as two-operand steps.
 * Each channel is built from an 8-bit intermediate (X op Y) followed by a
 * second operation with Z, the way a register-to-register instruction
 * sequence would compute it.
 */
void ASSEMBLY_XYZ2ABC(){
    int k;

    for (k = 0; k < BUF_SIZE; ++k) {
        const uint8_t x = XYZ_BUF[k].X;
        const uint8_t y = XYZ_BUF[k].Y;
        const uint8_t z = XYZ_BUF[k].Z;
        uint8_t stage;

        stage = (x + y);              /* 8-bit intermediate for A */
        ABC_BUF[k].A = (stage + z);

        stage = (x + y);              /* 8-bit intermediate for B */
        ABC_BUF[k].B = (stage - z);

        stage = (x - y);              /* 8-bit intermediate for C */
        ABC_BUF[k].C = (stage - z);
    }
}


/*
 * Pack four 8-bit lanes into one 32-bit word.
 *
 * i24 lands in bits 31..24, i16 in bits 23..16, i8 in bits 15..8 and
 * i0 in bits 7..0.
 *
 * Each lane is widened to uint32_t BEFORE shifting: a uint8_t operand is
 * promoted to (signed) int, and shifting a value >= 0x80 left by 24 would
 * overflow that int, which is undefined behaviour.
 */
uint32_t Exp4x8uint_8(uint8_t i24,uint8_t i16,uint8_t i8,uint8_t i0){
    return ((uint32_t)i24 << 24)
         | ((uint32_t)i16 << 16)
         | ((uint32_t)i8  <<  8)
         |  (uint32_t)i0;
}

/*
 * Lane-wise 8-bit arithmetic on two packed 32-bit words (4 lanes each).
 *
 * op == 0 : each lane of a32 is added to the matching lane of b32
 * op == 1 : each lane of b32 is subtracted from the matching lane of a32
 * other   : a32 is returned unchanged
 *
 * Every lane wraps modulo 256 on its own; no carry or borrow crosses into
 * the neighbouring lane. Lanes are widened to uint32_t before being
 * shifted back into place, so a result >= 0x80 in the top lane does not
 * overflow a promoted signed int.
 */
uint32_t Exe4x8uint_32(uint32_t a32,uint32_t b32,uint8_t op){
    uint32_t packed = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8) {
        uint8_t la = (uint8_t)(a32 >> shift);
        uint8_t lb = (uint8_t)(b32 >> shift);

        switch (op) {
            case 0:  la = (uint8_t)(la + lb); break;
            case 1:  la = (uint8_t)(la - lb); break;
            default: break;   /* unknown op: lane passes through */
        }

        packed |= (uint32_t)la << shift;
    }

    return packed;
}

/*
 * "NEON-style" conversion of XYZ_BUF into ABC_BUF.
 *
 * Four consecutive samples are packed per channel into one 32-bit word
 * (sample i in bits 7..0, sample i+3 in bits 31..24) and processed with the
 * lane-wise helper Exe4x8uint_32:
 *   A = X + Y + Z,  B = X + Y - Z,  C = X - Y - Z   (each lane modulo 256)
 *
 * Only complete groups of four are handled by the packed path. Any leftover
 * samples (BUF_SIZE not a multiple of 4) are converted one at a time, so the
 * buffers are never indexed past BUF_SIZE-1.
 */
void NEON_XYZ2ABC(){
    int i = 0;
    int lane;
    uint32_t X,Y,Z;
    uint32_t A,B,C;

    /* packed path: complete groups of four samples */
    for( ; i + 3 < BUF_SIZE; i += 4 ){
        X = Exp4x8uint_8(XYZ_BUF[i+3].X, XYZ_BUF[i+2].X, XYZ_BUF[i+1].X, XYZ_BUF[i].X);
        Y = Exp4x8uint_8(XYZ_BUF[i+3].Y, XYZ_BUF[i+2].Y, XYZ_BUF[i+1].Y, XYZ_BUF[i].Y);
        Z = Exp4x8uint_8(XYZ_BUF[i+3].Z, XYZ_BUF[i+2].Z, XYZ_BUF[i+1].Z, XYZ_BUF[i].Z);

        A = Exe4x8uint_32(X,Y,0);
        A = Exe4x8uint_32(A,Z,0);

        B = Exe4x8uint_32(X,Y,0);
        B = Exe4x8uint_32(B,Z,1);

        C = Exe4x8uint_32(X,Y,1);
        C = Exe4x8uint_32(C,Z,1);

        /* unpack: lane k (bits 8k+7..8k) belongs to sample i+k */
        for( lane = 0; lane < 4; lane++ ){
            ABC_BUF[i+lane].A = (uint8_t)(A >> (8*lane));
            ABC_BUF[i+lane].B = (uint8_t)(B >> (8*lane));
            ABC_BUF[i+lane].C = (uint8_t)(C >> (8*lane));
        }
    }

    /* scalar tail: fewer than four samples left */
    for( ; i < BUF_SIZE; i++ ){
        ABC_BUF[i].A = (uint8_t)(XYZ_BUF[i].X + XYZ_BUF[i].Y + XYZ_BUF[i].Z);
        ABC_BUF[i].B = (uint8_t)(XYZ_BUF[i].X + XYZ_BUF[i].Y - XYZ_BUF[i].Z);
        ABC_BUF[i].C = (uint8_t)(XYZ_BUF[i].X - XYZ_BUF[i].Y - XYZ_BUF[i].Z);
    }
}


/*
 * Print every element of ABC_BUF on its own line: the index followed by the
 * A, B and C channels, all in hexadecimal. A blank line ends the dump.
 */
void Display_ABC(){
    int row = 0;

    while (row < BUF_SIZE) {
        printf("I     :: %3x,",row);
        printf("A[%2d]:: %3x,",row,ABC_BUF[row].A);
        printf("B[%2d]:: %3x,",row,ABC_BUF[row].B);
        printf("C[%2d]:: %3x,\n",row,ABC_BUF[row].C);
        ++row;
    }
    printf("\n");
}

/*
 * Run the scalar conversion and the packed ("NEON") conversion in turn and
 * dump ABC_BUF after each so the two results can be compared by eye.
 * ABC_BUF is not cleared between the two runs.
 */
int main(int argc,char *argv[]){
 
 printf("C Code Result...\n");
 C_XYZ2ABC();
 Display_ABC();

 printf("NEON Result...\n");
 NEON_XYZ2ABC();
 Display_ABC();


return 0;
}

code download here Refs: RGB 2 YUV Optimizing Code for ARM Cortex-A8 with NEON SIMD

2010年8月29日星期日

在Bus Architecture 架構下, 有"Bus Matrix", "Share Bus","Net-Work on Chip Bus", 其中以 Share Bus 跟 Net-Work on chip 為主流架構. Share Bus 代表的有 ARM(AMBA), IBM(CoreConnect)...,而Net-Work on chip感覺目前還沒有業界support,一般還是在學術界比較普遍. 當然隨著進入多Cores 的世代 Net-Work on Chip 相對的就更有研究價值. Net-Work on Chip Advantages: 1. Faults recover 解決 Net Work Faults 的問題. 透過周圍有效的 Net 來做communication 如果用 Share Bus 的架構, Fault 會把 Bus 上的 Data tied住, 導致在這條Bus上的傳輸都會有問題. 2.Bandwidth/Performance enhancement利用不同的Nets 來減少 Bus Access dead Lock 時所造成的 latency , 如果是 Share Bus上,要等到Transfer完成 ,Bus usage 被 Release, 才能做下次傳輸.但 Net-Work on chips 可透過周圍有效的Nets 來彌補這個問題. 3. Power enhancement 透過Power Cluster 來動態調整每個 Net-Works 的Status. Disadvantages:1. HW cost 硬體設計的複雜度, 跟 Arbiter 數目 2. Utility 可能有些的Net-Works使用率不高,則有些使用率高.

為了避免 dead lock, 每個 Node 在決定下個 Node 前, 要先判斷 Direction 是否離 End Node 愈來愈近. 如果沒有, 就找下個 Node.



#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <list>

using namespace std;

#define MAX_NET_NODE 12 
#define BEGIN_NODE 0 
#define END_NODE 3

// Status codes used by the routing helpers.
// Only NET_TRACE_OK / NET_TRACE_FAL are returned by the code in this listing
// (FindNodeId2xy); the NET_NXT_* values are declared but not used here.
enum NETSTATUS{
 NET_NXT_OK =0,
 NET_NXT_FAL =1,
 NET_TRACE_OK =2,
 NET_TRACE_FAL =3,
};

// One outgoing link of a node: the neighbour's node id and the link weight.
// Note: the trailing declarator also defines a global object named NetPath.
struct NetPath {
 int Id;
 int Weight;    //delay
} NetPath;

// One network node: grid position (x, y), number of valid links (s) and the
// link table itself. Only the first s entries of NetPath are meaningful.
struct NeNode {
 int x;
 int y;
 int s;
        struct  NetPath  NetPath[MAX_NET_NODE];
}NetNode[MAX_NET_NODE];   // global node table, indexed by node id


 
void SetInitNetNodeTable(){
 // Net_0 set
 NetNode[0].x=0;
 NetNode[0].y=0; 
 NetNode[0].NetPath[0].Id     =1; NetNode[0].NetPath[0].Weight =3; 
        NetNode[0].NetPath[1].Id     =4; NetNode[0].NetPath[1].Weight =5;
  NetNode[0].s=2;
 // Net_1 set
 NetNode[1].x=0;
 NetNode[1].y=1; 
 NetNode[1].NetPath[0].Id     =0; NetNode[1].NetPath[0].Weight =5;
 NetNode[1].NetPath[1].Id     =2; NetNode[1].NetPath[1].Weight =5;
 NetNode[1].NetPath[2].Id     =5; NetNode[1].NetPath[2].Weight =5;
  NetNode[1].s=3;
 // Net_2 set
 NetNode[2].x=0;
 NetNode[2].y=2; 
 NetNode[2].NetPath[0].Id     =1; NetNode[2].NetPath[0].Weight =7;
 NetNode[2].NetPath[1].Id     =3; NetNode[2].NetPath[1].Weight =5;
 NetNode[2].NetPath[2].Id     =6; NetNode[2].NetPath[2].Weight =5;
 NetNode[2].s=3;
 // Net_3 set
 NetNode[3].x=0;
 NetNode[3].y=3; 
 NetNode[3].NetPath[0].Id     =2; NetNode[3].NetPath[0].Weight =4;
 NetNode[3].NetPath[1].Id     =7; NetNode[3].NetPath[1].Weight =3;
 NetNode[3].s=2;
 // Net_4 set
 NetNode[4].x=1;
 NetNode[4].y=0; 
 NetNode[4].NetPath[0].Id     =0; NetNode[4].NetPath[0].Weight =3;
 NetNode[4].NetPath[1].Id     =5; NetNode[4].NetPath[0].Weight =3;
 NetNode[4].NetPath[2].Id     =8; NetNode[4].NetPath[1].Weight =9;
 NetNode[4].s=3;
 // Net_5 set
 NetNode[5].x=1;
 NetNode[5].y=1; 
 NetNode[5].NetPath[0].Id     =1; NetNode[5].NetPath[0].Weight =5;
 NetNode[5].NetPath[1].Id     =4; NetNode[5].NetPath[1].Weight =6;
 NetNode[5].NetPath[2].Id     =6; NetNode[5].NetPath[2].Weight =8;
 NetNode[5].NetPath[3].Id     =9; NetNode[5].NetPath[3].Weight =4;
 NetNode[5].s=4;
 // Net_6 set
 NetNode[6].x=1;
 NetNode[6].y=2; 
 NetNode[6].NetPath[0].Id     =2; NetNode[6].NetPath[0].Weight =5;
 NetNode[6].NetPath[1].Id     =5; NetNode[6].NetPath[1].Weight =5;
 NetNode[6].NetPath[2].Id     =7; NetNode[6].NetPath[2].Weight =5;
 NetNode[6].NetPath[3].Id     =10; NetNode[6].NetPath[3].Weight =5;
 NetNode[6].s=4;
 // Net_7 set
 NetNode[7].x=1;
 NetNode[7].y=3; 
 NetNode[7].NetPath[0].Id     =3; NetNode[7].NetPath[0].Weight =4;
 NetNode[7].NetPath[1].Id     =6; NetNode[7].NetPath[1].Weight =4;
 NetNode[7].NetPath[2].Id     =11; NetNode[7].NetPath[2].Weight =4;
 NetNode[7].s=3;
 // Net_8 set
 NetNode[8].x=2;
 NetNode[8].y=0; 
 NetNode[8].NetPath[0].Id     =4; NetNode[8].NetPath[0].Weight =5;
 NetNode[8].NetPath[1].Id     =9; NetNode[8].NetPath[1].Weight =4;
 NetNode[8].s=2;
 // Net_9 set
 NetNode[9].x=2;
 NetNode[9].y=1; 
 NetNode[9].NetPath[0].Id     =5; NetNode[9].NetPath[0].Weight =5;
 NetNode[9].NetPath[1].Id     =8; NetNode[9].NetPath[1].Weight =7;
 NetNode[9].NetPath[2].Id     =10; NetNode[9].NetPath[2].Weight =6;
 NetNode[9].s=3;
 // Net_10 set
 NetNode[10].x=2;
 NetNode[10].y=2; 
 NetNode[10].NetPath[0].Id     =6; NetNode[10].NetPath[0].Weight =7;
 NetNode[10].NetPath[1].Id     =9; NetNode[10].NetPath[1].Weight =7;
 NetNode[10].NetPath[2].Id     =11; NetNode[10].NetPath[2].Weight =7;
 NetNode[10].s=3;
 // Net_11 set
 NetNode[11].x=2;
 NetNode[11].y=3; 
 NetNode[11].NetPath[0].Id     =6; NetNode[10].NetPath[0].Weight =7;
 NetNode[11].NetPath[1].Id     =11; NetNode[10].NetPath[1].Weight =7;
 NetNode[11].s=2;
}

// Translate a node id (0..11) into its grid coordinates.
//
// Ids are numbered row by row with four nodes per row, so x = id / 4 and
// y = id % 4. On success *x and *y are written and NET_TRACE_OK is returned.
// For any other id the outputs are left untouched and NET_TRACE_FAL is
// returned.
int FindNodeId2xy(int id, int *x, int *y){
    const int nodeCount = 12;
    const int rowLength = 4;

    if (id < 0 || id >= nodeCount) {
        return NET_TRACE_FAL;
    }

    *x = id / rowLength;
    *y = id % rowLength;
    return NET_TRACE_OK;
}

int xPathCot;
int yPathCot;
list<int> vistlist;

int FindNxtNode(int begin, int end, int xpath, int ypath){
   int i;
   int id;
   int bx,by;
   int tx,ty;
   int ixpath=xpath,iypath=ypath;
   
vistlist.push_back(begin);

 if( begin==end ){  
     for(list<int>::iterator it=vistlist.begin(); it!=vistlist.end(); it++){ cout<<*it<<","; } cout<<endl; 
      vistlist.pop_back();
      return 0; 
} 

 if( FindNodeId2xy(begin, &bx, &by) == NET_TRACE_FAL ){ return -1;} 

    for(i=0; i<NetNode[begin].s; i++){
             id = NetNode[begin].NetPath[i].Id;
         if( FindNodeId2xy(id, &tx, &ty) == NET_TRACE_FAL ){ break; return -1;} 
                 tx -=bx;
                 ty -=by;

                  if( xpath >0 && tx>0 ){ /*printf("a,%d,%d,%d,%d\n",begin,id,ixpath,iypath);*/  FindNxtNode(id,end,ixpath-1,iypath); }
             else if( xpath <0 && tx<0 ){ /*printf("b,%d,%d,%d,%d\n",begin,id,ixpath,iypath);*/  FindNxtNode(id,end,ixpath+1,iypath); }
             else if( ypath >0 && ty>0 ){ /*printf("c,%d,%d,%d,%d\n",begin,id,ixpath,iypath);*/  FindNxtNode(id,end,ixpath  ,iypath-1); }
             else if( ypath <0 && ty<0 ){ /*printf("d,%d,%d,%d,%d\n",begin,id,ixpath,iypath);*/  FindNxtNode(id,end,ixpath  ,iypath+1); }
     }
vistlist.pop_back();
}


// Build the node table, compute the signed x/y distance from BEGIN_NODE to
// END_NODE and print the routes found by FindNxtNode.
// Returns -1 if either endpoint id cannot be mapped to coordinates.
int main(int argc,char* argv[]){

SetInitNetNodeTable();

int Beginx,Beginy;
int Endx,Endy;

if( FindNodeId2xy(BEGIN_NODE,&Beginx,&Beginy)==NET_TRACE_FAL ){ printf("@ Begin Node 2 MapTable Error ...\n"); return -1; }
if( FindNodeId2xy(END_NODE,  &Endx,  &Endy  )==NET_TRACE_FAL ){ printf("@ End   Node 2 MapTable Error ...\n"); return -1; }

// remaining distance along each axis; the sign gives the direction
xPathCot = Endx-Beginx;
yPathCot = Endy-Beginy;

FindNxtNode(BEGIN_NODE,END_NODE,xPathCot,yPathCot);
return 0;
}

Refs: Network On Chip

Network-on-Chip @ Google group

有關 network on chip 的學術文章

2010年8月26日星期四

uBoot Case Study @ omap3

在 Boot 的流程中, boot-loader 會先做些簡單的初始化, 如 CPU/RAM set...and load Kernel image to RAM,之後把使用權還給 Kernel build. Kernel 會根據現有的 Architecture 載入相對應的 Driver 跟 configure, 載入 root files,最後 run script 建立起 Application. 底下針對 u-boot-2010.06 OMAP Cortex-A8 @beagle 來做說明. Step1. 在 uBoot 的 Flow 中要先定義好 CPU && Board 的 Type 1.ARC -> CPU config ARM, X86, MIPS... 2.ARC -> Board config TI, ASUS... 這邊假設是用 CPU @ u-boot-2010.06/arch/arm/cpu/arm_cortexa8 Board @ u-boot-2010.06/board/ti/beagle Step2. uBoot loaded && LOW level set @ Arc/CPU 在 u-boot.lds 中會定義 boot-loader 的起始位置跟 Entry point @ 0x00000000. and entry @ arm_cortexa8/start.o Ref: u-boot.lds

OUTPUT_FORMAT("elf32-littlearm", "elf32-littlearm", "elf32-littlearm")
OUTPUT_ARCH(arm)
ENTRY(_start)
SECTIONS
{
        . = 0x00000000;

        . = ALIGN(4);
        .text   :
        {
                arch/arm/cpu/arm_cortexa8/start.o       (.text)
                *(.text)
        }

而在 start.s 中,主要透過Assembly code 來設定 @ARM CPU 基本的function Mode,跟load boot-loader 到 RAM 的動作. start.s

/*
 * Entry point and vector table: eight one-instruction slots.
 * The first slot branches straight to `reset`; each of the others loads pc
 * from a literal word (defined right below) that holds the address of the
 * matching handler.
 * NOTE(review): the slot order matches the usual ARM exception vector
 * layout (reset, undef, swi, prefetch abort, data abort, unused, irq, fiq)
 * -- confirm against the architecture manual.
 */
.globl _start
_start: b       reset
        ldr     pc, _undefined_instruction
        ldr     pc, _software_interrupt
        ldr     pc, _prefetch_abort
        ldr     pc, _data_abort
        ldr     pc, _not_used
        ldr     pc, _irq
        ldr     pc, _fiq

/* literal pool: one word per handler address, read by the ldr slots above */
_undefined_instruction: .word undefined_instruction
_software_interrupt:    .word software_interrupt
_prefetch_abort:        .word prefetch_abort
_data_abort:            .word data_abort
_not_used:              .word not_used
_irq:                   .word irq
_fiq:                   .word fiq
_pad:                   .word 0x12345678 /* now 16*4=64 */

lowlevel_init.S 設定 whole chip 的 Freq., 包含 PLL set, Divide set, Mux set, Memory set, and PLL lock wait (等待 PLL 能穩定).

.globl lowlevel_init
lowlevel_init:
        ldr     sp, SRAM_STACK
        str     ip, [sp]        /* stash old link register */
        mov     ip, lr          /* save link reg across call */
        bl      s_init          /* go setup pll, mux, memory */
        ldr     ip, [sp]        /* restore save ip */
        mov     lr, ip          /* restore link reg */

        /* back to arch calling code */
        mov     pc, lr
....
pll_ctl_add:
        .word CM_CLKEN_PLL
pll_div_add1:
        .word CM_CLKSEL1_PLL

cache.S 中設定 Icache(instruction cache) && Dcache(data cache). 如 cache enable, cache disable, cache flush...

l2_cache_enable:
        stmfd   r13!, {r0, r1, r2, lr}
        @ ES2 onwards we can disable/enable L2 ourselves
        bl      get_cpu_rev
        cmp     r0, #CPU_3XX_ES20
        blt     l2_cache_disable_EARLIER_THAN_ES2
        mrc     15, 0, r3, cr1, cr0, 1
        orr     r3, r3, #2
        mcr     15, 0, r3, cr1, cr0, 1
        b       l2_cache_enable_END

step3. other sets GPIO, sys_info, syslib... 其實就是分別依序填入control value 到相對應的"control Register",能讓系統能做到簡單的work,之後在load kernel image, and build.... Refs : ARM cortexa8 beagle Refs: Das U-Boot -- the Universal Boot Loader Inside the Linux boot process Booting 基于S3C2410的Linux全线移植文档作者：dozec U-Boot Quick Reference

AMBA 4.0 AXI Bus Pt2

接續　AMBA 4.0 AXI Bus Pt1, 底下用 @c code 來模擬 AMBA 4.0 out-of-order的形式. 主要區分成 Address Phase, Read-Data Phase. 透過 sleep delay 來模擬 Slave 快速跟慢速的 Response. out_of_order.c


#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

#define MAX_BUFF_DEEP 4 
#define MAX_BURST 4 
#define MAX_TEST_COT 8 

pthread_mutex_t count_mutex     = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t condition_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  condition_cond  = PTHREAD_COND_INITIALIZER;

/*
 * Flag values stored in the FULL / EMPTY fields of the bus buffers.
 * Beware: BUS_TRUE is 0 and BUS_FALSE is 1, the opposite of C truthiness,
 * so these must always be compared explicitly (== BUS_TRUE), never tested
 * as plain booleans.
 */
enum BUS_STATUS {
  BUS_TRUE        =0,
 BUS_FALSE =1,
};

/*
 * Address map of the simulated bus: inclusive start (_ST) and end (_ED)
 * address of every slave and master window.
 */
enum BUS_ADDR_MAP {
 BUS_SLAVE_1_ST = 0x00000100,
 BUS_SLAVE_1_ED = 0x000001ff,
 BUS_SLAVE_2_ST = 0x00000200,
 BUS_SLAVE_2_ED = 0x000002ff,
 BUS_MASTER_1_ST = 0x00000300,
 BUS_MASTER_1_ED = 0x000003ff,
 BUS_MASTER_2_ST = 0x00000400,
 BUS_MASTER_2_ED = 0x000004ff,
};

/*
 * Address-phase queue: up to MAX_BUFF_DEEP pending requests.
 * Entry i is the pair (FmAddr[i], ToAddr[i]); INX is the number of queued
 * entries; FULL / EMPTY hold BUS_TRUE or BUS_FALSE and are refreshed by
 * CheckAddrBusBufStatus().
 */
typedef struct AddrBus {
 int FmAddr[MAX_BUFF_DEEP];
 int ToAddr[MAX_BUFF_DEEP];
 int INX;
 int FULL;
 int EMPTY;
} AddrBusBuf;

/* Payload of one read burst: MAX_BURST data words. */
typedef struct RdData {
 int Data[MAX_BURST];
} RdDataBuf;


/*
 * Read-data-phase queue: up to MAX_BUFF_DEEP completed bursts.
 * Entry i is the burst Data[i] together with the requester address
 * FmAddr[i]; INX is the number of queued entries; FULL / EMPTY hold
 * BUS_TRUE or BUS_FALSE and are refreshed by CheckRdDataBusBufStatus().
 */
typedef struct RdDataBus {
 RdDataBuf   Data[MAX_BUFF_DEEP];
 int  FmAddr[MAX_BUFF_DEEP];
 int  INX;
 int  FULL;
 int  EMPTY; 
} RdDataBusBuf;

int TEST_COT=0;

AddrBusBuf AddrBusBufPtr;
RdDataBusBuf RdDataBusBufPtr;


void  AddrBusBuf_Initial(){
      AddrBusBufPtr.INX  =0;
      AddrBusBufPtr.FULL = BUS_FALSE;
      AddrBusBufPtr.EMPTY= BUS_TRUE;
}

void  RdDataBusBuf_Initial(){
      RdDataBusBufPtr.INX  =0;
      RdDataBusBufPtr.FULL = BUS_FALSE;
      RdDataBusBufPtr.EMPTY= BUS_TRUE;
}

void SetAddrBusBufStatus(int FmAddr,int ToAddr){
     int inx = AddrBusBufPtr.INX;
     AddrBusBufPtr.FmAddr[inx] =  FmAddr;
     AddrBusBufPtr.ToAddr[inx] =  ToAddr;
     inx++;
     AddrBusBufPtr.INX = inx;
}

void GetAddrBusBufStatus(){
     int inx = AddrBusBufPtr.INX;
     int i;     
    for( i=0; i<inx; i++){
      AddrBusBufPtr.FmAddr[i] = AddrBusBufPtr.FmAddr[i+1];
      AddrBusBufPtr.ToAddr[i] = AddrBusBufPtr.ToAddr[i+1];
   }
      inx--;
      AddrBusBufPtr.INX = inx;
}

/*
 * Recompute the FULL and EMPTY flags of the address queue from its entry
 * count: FULL when INX equals MAX_BUFF_DEEP, EMPTY when INX is 0.
 */
void CheckAddrBusBufStatus(){
     const int count = AddrBusBufPtr.INX;

     if( count==MAX_BUFF_DEEP ){
         AddrBusBufPtr.FULL = BUS_TRUE;
     } else {
         AddrBusBufPtr.FULL = BUS_FALSE;
     }

     if( count==0 ){
         AddrBusBufPtr.EMPTY = BUS_TRUE;
     } else {
         AddrBusBufPtr.EMPTY = BUS_FALSE;
     }
}


void GetRdDataBusBufStatus(int n){
    int inx = RdDataBusBufPtr.INX;
    int i;
      printf("M%d Receive @ FmAdder: %x\n",n, RdDataBusBufPtr.FmAddr[0] ); 
 
   for( i=0; i<MAX_BURST; i++ ){
      printf("M%d Receive @ %d, %d\n",n, i, RdDataBusBufPtr.Data[0].Data[i] );
    }
   //update RdDataBusBuf 
   for( i=0; i<inx; i++){
       RdDataBusBufPtr.FmAddr[i] = RdDataBusBufPtr.FmAddr[i+1];
       RdDataBusBufPtr.Data[i]   = RdDataBusBufPtr.Data[i+1];
    }   
       inx--;
       RdDataBusBufPtr.INX = inx;
}

void SetRdDataBusBufStatus(int FmAddr,int n){
     int i;
     int inx = RdDataBusBufPtr.INX;
     
      for(i=0; i<MAX_BURST; i++){
             RdDataBusBufPtr.Data[inx].Data[i] = (n==1)? 10+i : 100+i;
       }
             RdDataBusBufPtr.FmAddr[inx] = FmAddr;
      inx++; 
      RdDataBusBufPtr.INX = inx;
}

 
/*
 * Recompute the FULL and EMPTY flags of the read-data queue from its entry
 * count: FULL when INX equals MAX_BUFF_DEEP, EMPTY when INX is 0.
 */
void CheckRdDataBusBufStatus(){
     const int count = RdDataBusBufPtr.INX;

     if( count==MAX_BUFF_DEEP ){
         RdDataBusBufPtr.FULL = BUS_TRUE;
     } else {
         RdDataBusBufPtr.FULL = BUS_FALSE;
     }

     if( count==0 ){
         RdDataBusBufPtr.EMPTY = BUS_TRUE;
     } else {
         RdDataBusBufPtr.EMPTY = BUS_FALSE;
     }
}


/*
 * Thread body: address phase of master 1.
 * While TEST_COT is below MAX_TEST_COT it posts one request per pass into
 * the address queue. The target alternates between slave 1 and slave 2
 * with the parity of TEST_COT, which is advanced by the receiver thread.
 */
void *MASTER_1_Transmitter(void *t){
     long my_id = (long)t;
     int FmAddr,ToAddr;

     while(TEST_COT<MAX_TEST_COT){
      int cot =10;   /* retry budget for the wait loop below */
          
         /* poll (without holding the mutex) until the address queue has room */
         while( AddrBusBufPtr.FULL == BUS_TRUE ){
             sleep(1);
             if( cot==0 ){ printf("out-of-Time-Wait ... @ M1 Transmitter 4 Addr Phase\n"); break; }
             cot--;
         } 

         /* cot == 0 means the wait above gave up: skip this pass */
         if( cot>0 ){
            pthread_mutex_lock(&count_mutex);
            FmAddr = BUS_MASTER_1_ST;
            ToAddr = ( TEST_COT%2==0 )? BUS_SLAVE_1_ST : BUS_SLAVE_2_ST;
            SetAddrBusBufStatus( FmAddr, ToAddr);
            CheckAddrBusBufStatus();
            printf("M1 Transmit @ FmAddr: %x, ToAddr %x\n", FmAddr, ToAddr);    
            pthread_mutex_unlock(&count_mutex);
         }

         sleep(1);
     }

pthread_exit(NULL);
}

/*
 * Thread body: read-data phase of master 1.
 * Takes the head burst of the read-data queue when its requester address
 * lies in master 1's window, prints it and increments TEST_COT. TEST_COT is
 * the loop condition of every thread in this program, so each delivered
 * burst moves the whole test towards termination.
 */
void *MASTER_1_Receiver(void *t){
    long my_id = (long)t;
 
    while(TEST_COT<MAX_TEST_COT){
    int  cot   =10;   /* retry budget for the wait loop below */
     
       /* poll (without holding the mutex) until a burst is available */
       while( RdDataBusBufPtr.EMPTY == BUS_TRUE ){
          sleep(1);
          if( cot==0 ){ printf("out-of-Time-Wait ... @ M1 Receiver 4 RdData Phase\n"); break; }
          cot--;  
     }
     
     int i,inx;
     /* the address check reads slot 0 before the mutex is taken */
     if( cot>0 && BUS_MASTER_1_ST <= RdDataBusBufPtr.FmAddr[0] && RdDataBusBufPtr.FmAddr[0] <= BUS_MASTER_1_ED ){
         pthread_mutex_lock(&count_mutex);
         GetRdDataBusBufStatus(1);
         CheckRdDataBusBufStatus();
         TEST_COT++;
         pthread_mutex_unlock(&count_mutex);                
     }

     sleep(1);
   }
 
pthread_exit(NULL);
}


void *SLAVE_1_DO(void *t){
    long my_id = (long)t;
    int  FmAddr;
  
    while(TEST_COT<MAX_TEST_COT){
     int cot =10;

       while( RdDataBusBufPtr.FULL == BUS_TRUE && AddrBusBufPtr.EMPTY == BUS_TRUE ){
          sleep(2);
          if( cot==0 ){ printf("out-ot-Time-Wait ... @ S1 Receive 4 RdData/Addr Phase\n"); break; }
          cot --;
       }

       if( cot>0  && BUS_SLAVE_1_ST <= AddrBusBufPtr.ToAddr[0] && AddrBusBufPtr.ToAddr[0] <= BUS_SLAVE_1_ED ){
          pthread_mutex_lock(&count_mutex);
          FmAddr = AddrBusBufPtr.FmAddr[0]; 
          printf("S1 Receive Req @ FmAddr: %x\n", FmAddr);
          SetRdDataBusBufStatus( FmAddr, 1);
          GetAddrBusBufStatus();
          CheckAddrBusBufStatus();
          CheckRdDataBusBufStatus();
          pthread_mutex_unlock(&count_mutex);
      }

     sleep(3);  
   }
 pthread_exit(NULL);
}

void *SLAVE_2_DO(void *t){
    long my_id = (long)t;
    int  FmAddr;
  
    while(TEST_COT<MAX_TEST_COT){
     int cot =10;

       while( RdDataBusBufPtr.FULL == BUS_TRUE && AddrBusBufPtr.EMPTY == BUS_TRUE ){
          sleep(2);
          if( cot==0 ){ printf("out-ot-Time-Wait ... @ S2 Receive 4 RdData/Addr Phase\n"); break; }
          cot --;
       }

       if( cot>0  && BUS_SLAVE_2_ST <= AddrBusBufPtr.ToAddr[0] && AddrBusBufPtr.ToAddr[0] <= BUS_SLAVE_2_ED ){
          pthread_mutex_lock(&count_mutex);
          FmAddr = AddrBusBufPtr.FmAddr[0]; 
          printf("S2 Receive Req @ FmAddr: %x\n", FmAddr);
          SetRdDataBusBufStatus( FmAddr, 2);
          GetAddrBusBufStatus();
          CheckAddrBusBufStatus();
          CheckRdDataBusBufStatus();
          pthread_mutex_unlock(&count_mutex);
      }

     sleep(5);  
   }
 pthread_exit(NULL);
}

/*
 * Reset both bus queues, start the four actor threads (master transmitter,
 * master receiver, slave 1, slave 2) and wait for all of them to finish.
 */
int main(int argc,char* argv[]){
 AddrBusBuf_Initial();
 RdDataBusBuf_Initial();
 
 pthread_t thread[4];

 pthread_create( &thread[0],NULL, MASTER_1_Transmitter, NULL);
 pthread_create( &thread[1],NULL, MASTER_1_Receiver, NULL);
 pthread_create( &thread[2],NULL, SLAVE_1_DO, NULL);
 pthread_create( &thread[3],NULL, SLAVE_2_DO, NULL);
                      
 pthread_join( thread[0],NULL);
 pthread_join( thread[1],NULL);
 pthread_join( thread[2],NULL);
 pthread_join( thread[3],NULL);

 /* ends the main thread here; the return below is never reached */
 pthread_exit(NULL);

return 0;
}

Results: root@sean-laptop:/home/sean/prj/SOC_c_model/out_of_order# ./a.out M1 Transmit @ FmAddr: 300, ToAddr 100 S1 Receive Req @ FmAddr: 300 M1 Transmit @ FmAddr: 300, ToAddr 100 M1 Receive @ FmAdder: 300 M1 Receive @ 0, 10 M1 Receive @ 1, 11 M1 Receive @ 2, 12 M1 Receive @ 3, 13 M1 Transmit @ FmAddr: 300, ToAddr 200 S1 Receive Req @ FmAddr: 300 M1 Receive @ FmAdder: 300 M1 Receive @ 0, 10 M1 Receive @ 1, 11 M1 Receive @ 2, 12 M1 Receive @ 3, 13 M1 Transmit @ FmAddr: 300, ToAddr 100 M1 Transmit @ FmAddr: 300, ToAddr 100 M1 Transmit @ FmAddr: 300, ToAddr 100 S2 Receive Req @ FmAddr: 300 M1 Receive @ FmAdder: 300 M1 Receive @ 0, 100 M1 Receive @ 1, 101 M1 Receive @ 2, 102 M1 Receive @ 3, 103 S1 Receive Req @ FmAddr: 300 M1 Transmit @ FmAddr: 300, ToAddr 200 M1 Receive @ FmAdder: 300 M1 Receive @ 0, 10 M1 Receive @ 1, 11 M1 Receive @ 2, 12 M1 Receive @ 3, 13 sample code download here Summary AXI vs AHB

2010年8月24日星期二

AMBA 4.0 AXI Bus Pt1

在 learning plus: AMBA 4 ARM, learning plus: AHB Platform emulator @ SystemC, learning plus: ESL Design Flow 中我們有介紹 AMBA 2.0 AHB 跟 APB 的 Bus 架構, 跟 SystemC Module @ AHB,APB 的 virtual platform. 不過 ARM 在 AMBA 3.0 之後為了要增加 Bus 的效能, 而改變了先前 AMBA 2.0 的 protocol. 主要在於 AMBA 2.0 採用的是 in-order 機制, 就是 Data 傳輸完後 Bus 的使用權才會被 Release, 表示一組 Transfer 是不能被中斷的, 相對的 Bus 會一直被 Lock, 導致整個系統的 performance 會被慢速的 Transfer 給影響. 所以在 AMBA 3.0 之後採用 out-of-order 的概念做不對等的傳輸方式, 表示每組 Transfer 可根據自己本身速度的快慢來 Access Bus. Ex: AMBA 2.0 Burst(4) type M1@33Mhz, M2@66Mhz SEQ : M1->M2 M1(0), M1(1), M1(2), M1(3), M2(0), M2(1), M2(2), M2(3) 要等 M1 完成, M2 才開始 Ex: AMBA >=3.0 Burst(4) type M1@33Mhz, M2@66Mhz SEQ : M1->M2 M2(0), M2(1), M2(2), M2(3), M1(0), M1(1), M1(2), M1(3) 取決於 M1, M2 的速度, 不必等 M1 完成, M2 即可開始. 當然這需要硬體支援 sort 跟 reorder 機制, 並且 Address Phase 和 Data Phase 要完全分開.

底下用sample code來模擬 in-order 跟　out-of-order的差別 in_order.c


#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

#define MAX_BURST 4 
#define MAX_TEST_COT 3

pthread_mutex_t count_mutex     = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t condition_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  condition_cond  = PTHREAD_COND_INITIALIZER;

/*
 * Values of the two independent state fields of the shared bus:
 * Busy takes BUS_ON_BUSY / BUS_OFF_BUSY, Lock takes BUS_ON_LOCK /
 * BUS_OFF_LOCK.
 */
enum BUS_STATUS {
  BUS_ON_BUSY =0,
 BUS_OFF_BUSY =1,
 BUS_ON_LOCK =2,
 BUS_OFF_LOCK =3,
};

/*
 * Address map of the simulated bus: inclusive start (_ST) and end (_ED)
 * address of the two slave windows and the master window.
 */
enum BUS_ADDR_MAP {
 BUS_SLAVE_1_ST = 0x00000100,
 BUS_SLAVE_1_ED = 0x000001ff,
 BUS_SLAVE_2_ST = 0x00000200,
 BUS_SLAVE_2_ED = 0x000002ff,
 BUS_MASTER_1_ST = 0x00000300,
 BUS_MASTER_1_ED = 0x000003ff,
};

/*
 * The single shared in-order bus: one transfer at a time.
 * FmAddr / ToAddr identify the requester and the target, Data holds the
 * MAX_BURST words returned by the slave, Busy is set by the transmitter
 * while a transfer is outstanding and Lock is set by the slave once it has
 * filled Data.
 */
typedef struct Bus {
 int FmAddr;
 int ToAddr;
 int Data[MAX_BURST];
 int Busy;
 int Lock;
} Bus_Buf;


Bus_Buf BusPtr;
int TEST_COT=0;

void Bus_Initial(){
     BusPtr.Busy = BUS_OFF_BUSY;
     BusPtr.Lock = BUS_OFF_LOCK;
}

/*
 * Thread body: request side of master 1 on the in-order bus.
 * Once the bus is idle it places one request on it (target alternates
 * between slave 1 and slave 2 with the parity of TEST_COT) and marks the
 * bus busy. TEST_COT is advanced by the receiver thread.
 */
void *MASTER_1_Transmitter(void *t){
     long my_id = (long)t;

while( TEST_COT < MAX_TEST_COT ){   
    int  cot =10;   /* retry budget for the wait loop below */
 
   /* poll (without holding the mutex) until the previous transfer is done */
   while( BusPtr.Busy == BUS_ON_BUSY ){
     sleep(1);     
     if( cot==0 ){ printf("Out-of-Time-Wait M1 Transmitter retry...\n"); break; }
     cot--;
  }

 /* cot == 0 means the wait above gave up: back off and retry */
 if( cot>0 ){
      printf("M1 Transmitter @ %d\n", TEST_COT);
      pthread_mutex_lock(&count_mutex);
      BusPtr.FmAddr = BUS_MASTER_1_ST;
      BusPtr.ToAddr = ( TEST_COT%2==0 )? BUS_SLAVE_1_ST: BUS_SLAVE_2_ST;
      BusPtr.Busy = BUS_ON_BUSY;
      pthread_mutex_unlock(&count_mutex);
 } else {
   sleep(2);
 }

 sleep(3);
}

pthread_exit(NULL);
}

void *MASTER_1_Receiver(void *t){
    long my_id = (long)t;
    int  cot   =10;

while( TEST_COT < MAX_TEST_COT ){   
 
  while( BusPtr.Busy == BUS_ON_BUSY && BusPtr.Lock == BUS_OFF_LOCK ){
    sleep(1);
    if( cot==0 ){ printf("Out-of-Time wait M1 Receiver retry...\n"); break; }
    cot--; 
  }
  
  int i;
  
 if( cot>0 && BUS_MASTER_1_ST <= BusPtr.FmAddr && BusPtr.FmAddr <= BUS_MASTER_1_ED ){
    pthread_mutex_lock(&count_mutex);
    BusPtr.Busy = BUS_OFF_BUSY;
    BusPtr.Lock = BUS_OFF_LOCK;
    for(i=0; i<MAX_BURST; i++){
      printf("M1 Receive Fm %d,%d,%d\n",BusPtr.ToAddr,i,BusPtr.Data[i]);
    }
    printf("M1 Reveive Done @ %d\n",TEST_COT);
    printf("\n");

    TEST_COT++;
    pthread_mutex_unlock(&count_mutex);

  } else {
    sleep(2);
  }

 sleep(3);
}

pthread_exit(NULL);
}

/*
 * Thread body: slave 1 on the in-order bus.
 *
 * When a request addressed to slave 1's window is on the bus it fills the
 * burst data with 10, 11, ... and sets Lock to BUS_ON_LOCK to signal the
 * receiver.
 *
 * The idle wait also watches TEST_COT. After the last transfer the bus may
 * stay idle forever; without that check the thread would never leave the
 * inner loop and pthread_join() in main would hang.
 */
void *SLAVE_1_DO(void *t){
    (void)t;   /* thread argument is not used */

    while( TEST_COT < MAX_TEST_COT ){
        int i;

        /* idle until a request is placed on the bus or the test is over */
        while( BusPtr.Busy == BUS_OFF_BUSY && TEST_COT < MAX_TEST_COT ){
            sleep(1);
        }
        if( TEST_COT >= MAX_TEST_COT ){
            break;
        }

        if( BUS_SLAVE_1_ST <= BusPtr.ToAddr && BusPtr.ToAddr <= BUS_SLAVE_1_ED ){
            pthread_mutex_lock(&count_mutex);
            BusPtr.Lock = BUS_ON_LOCK;
            for(i=0; i<MAX_BURST; i++){
                BusPtr.Data[i] = i+10;
            }
            pthread_mutex_unlock(&count_mutex);
        } else {
            sleep(3);   /* request is for another slave */
        }

        sleep(3);
    }

    pthread_exit(NULL);
}

/*
 * Thread body: slave 2 on the in-order bus.
 *
 * When a request addressed to slave 2's window is on the bus it fills the
 * burst data with 100, 101, ... and sets Lock to BUS_ON_LOCK to signal the
 * receiver.
 *
 * The idle wait also watches TEST_COT. After the last transfer the bus may
 * stay idle forever; without that check the thread would never leave the
 * inner loop and pthread_join() in main would hang.
 */
void *SLAVE_2_DO(void *t){
    (void)t;   /* thread argument is not used */

    while( TEST_COT < MAX_TEST_COT ){
        int i;

        /* idle until a request is placed on the bus or the test is over */
        while( BusPtr.Busy == BUS_OFF_BUSY && TEST_COT < MAX_TEST_COT ){
            sleep(1);
        }
        if( TEST_COT >= MAX_TEST_COT ){
            break;
        }

        if( BUS_SLAVE_2_ST <= BusPtr.ToAddr && BusPtr.ToAddr <= BUS_SLAVE_2_ED ){
            pthread_mutex_lock(&count_mutex);
            BusPtr.Lock = BUS_ON_LOCK;
            for(i=0; i<MAX_BURST; i++){
                BusPtr.Data[i] = i+100;
            }
            pthread_mutex_unlock(&count_mutex);
        } else {
            sleep(3);   /* request is for another slave */
        }

        sleep(3);
    }

    pthread_exit(NULL);
}

/*
 * Reset the shared bus, start the four actor threads (master transmitter,
 * master receiver, slave 1, slave 2) and wait for all of them to finish.
 */
int main(int argc,char* argv[]){
 Bus_Initial();
 
 pthread_t thread[4];

 pthread_create( &thread[0],NULL, MASTER_1_Transmitter, NULL);
 pthread_create( &thread[1],NULL, MASTER_1_Receiver, NULL);
 pthread_create( &thread[2],NULL, SLAVE_1_DO, NULL);
 pthread_create( &thread[3],NULL, SLAVE_2_DO, NULL);
                      
 pthread_join( thread[0],NULL);
 pthread_join( thread[1],NULL);
 pthread_join( thread[2],NULL);
 pthread_join( thread[3],NULL);

 /* ends the main thread here; the return below is never reached */
 pthread_exit(NULL);

return 0;
}

Results M1 Transmitter @ 0 M1 Receive Fm 256,0,10 M1 Receive Fm 256,1,11 M1 Receive Fm 256,2,12 M1 Receive Fm 256,3,13 M1 Reveive Done @ 0 M1 Transmitter @ 1 M1 Receive Fm 512,0,100 M1 Receive Fm 512,1,101 M1 Receive Fm 512,2,102 M1 Receive Fm 512,3,103 M1 Reveive Done @ 1 sample code download here Refs: ARM Technology 基于AMBA AXI总线的低功耗扩展设计 On Chip Communication Architectures AHB vs. AXI Burst. AHB Burst. Address and Data are locked together (single pipeline stage); HREADY controls intervals of address and data ...

2010年8月23日星期一

ARM vs X86

有鑑於 ARM 在 Embedded Market 如此成功,現在x86 的老大哥 Intel 也注意到這塊大餅的商機,畢竟 Cell Phone, Table PC, EE-PC,, 的汰換率高,不像桌機一用就是好幾年,這對 CPU 製造商而言, 每個世代的CPU所推出的時間就會被Delay,或者是消費者的購買慾望降低,相對的獲利就會大幅縮水..好像扯遠了... 底下找了一些 data,有興趣的人可以看看,大概說明 ARM 跟 X86 在Architecture 上的差異,跟未來 ARM 還有 X86的市場走向. 結語不難發現 ARM 也走向Multi cores的架構,而 Intel 在 Power Architecture 上下功夫,代表High Performance && Low Power 是這兩家公司主要的目標,分別截長補短來補足現有架構下的缺失. Refs : ARM® Instruction Set Quick Reference Card Intel fires opening salvo in x86 vs. ARM smartphone wars Choosing a CPU: x86 vs ARM Inside Intel's Atom Z600 series Intel Unveils Moorestown and the Atom Z600, The Fastest Smartphone Platform? x86 vs ARM Mobile CPUs PRESS KIT - Second-Generation Intel® Atom™ Processor-Based Platform for Handhelds

Intel® Atom™ Processor

Sharkv2 2010_06~08 profit reports pt2

接續 Sharkv2 2010_06~08 profit reports pt1. 在之前我們已經把所有的解空間轉成 Graph 的型式, 包含 Edge 跟 Node 的資料型態. 底下透過 Boost C++ lib 所提供的 DFS (depth-first search) 來找出最多的交易次數 && 最少的交易次數, 再分別比較每條交易曲線所各自代表的獲利跟交易次數的相關性, 藉由不同的持股標的跟進出場時間來分析這一季或這段交易時間內的風險跟持股的周期. ps : Boost C++ 內 support graphviz 的 input format, 可參考 boost_1_43_0/libs/graph/example/dfs-parenthesis.cpp. Refs: Boost build [boost] how to compile when using read_graphml linking error read_graphviz Google answer

2010年8月20日星期五

Sharkv2 2010_06~08 profit reports pt1

底下為 Sharkv2 在 2010_06~08的績效表. 可參考 Finance Lists下,我們之前發過的post... Doc definition: sheet (c1) 為首次出現買進訊號時的交易績效表 sheet (c2) 為連續出現兩次買進訊號時的交易績效表 sheet (c3) 為連續出現三次買進訊號時的交易績效表可以發現並不是連續買進訊號愈多代表交易績效愈好,因為我們這個是底部(頭)反轉系統,要確保股價已經打底完成,且做上漲準備時的交易區間.如果交易買進的連續訊號過多,表示底部尚未形成還有補跌的空間.所以適當的連續交易訊號才有一個較好的獲利區間. ps : 交易績效表下載但其實用剛剛的績效表只能顯示我們每次的進出場時間點跟全部的獲利情況,就時間跟空間軸而言還不算完整,因為時間是連續性的且我們的成本是固定的,代表在現有的分險中手中持股數目跟價錢是固定的.表示握有的股票不會同時持有不同的交易標的,當然這是假設每次交易只有單一持股的情況,不過似乎很少人會這麼做.any way,底下我們假設每次交易都只持有單一股票,且在時間軸上做交易的情形,除了可以分析我們持股的周期,持股的股性,持股的交易次數...最後可以得到一個時間跟獲利的交易曲線,利用這兩條交易曲線做cross 就可找出獲利區間是落在哪個地方,分別各自代表時間上的哪個部位,還有交易時的訊號情況(indicator,三大法人...) 欄位說明 ID,Entry Day,End Day, Entry Price, End Price, Profit 所以經由上述的說明我們可以找到所有的Case solutions,假設 case 1. E1 = 1102.TW,2010/06/09,2010/06/21,27.55,29.8,2.25 E2 = 1605.TW,2010/06/28,2010/07/06,12,12.85,0.85 E3 = 1210.TW,2010/07/13,2010/07/20,29.2,31.4,2.2 ... case 2. E1 = 1530.TW,2010/06/04,2010/06/24,36.3,39.8,3.5 E2 = 1435.TW,2010/06/30,2010/07/05,5.81,6.25,0.44 E3 = 1210.TW,2010/07/13,2010/07/20,29.2,31.4,2.2 ... case ... 經由timing Expand,我們可以找出我們在timing 區間上每次交易的Path組合(case),可能 case 1 Profit(30), Time(10). case 2 Profit(25), Time(4). 就可以發現在 case 1 雖然交易的獲利比較好,但進出次數過多,相對的風險也較大. case 2 雖然獲利較差,但進出場次數較少,相對的風險也較低,整體而言在case 2 的投資組合更勝 case 1. 可以利用 BFS/DFS Graph search 來尋找最大的獲利組合跟最少的獲利組合,再分析獲利組合下的交易次數,最後兩者做圖形的重疊,就可以找到我們所要的獲利區間.

2010年8月18日星期三

Lex & Yacc case study @ PLY

除了用 Perl/Python 做簡單的 regular expression 外,最常用到的就是 Lex && Yacc 摟... Lex : Lexical Analyzer Yacc : Yet Another Compiler Compiler Lex Yacc 簡易介紹 Lex Yacc

Lex && Yacc flow

詳細請參考 A COMPACT GUIDE TO LEX & YACC 底下用 open source 的 Project 來完成我們要的Parser. 請先安裝 PLY (Python Lex-Yacc) input part

int main(){
a=1;
b=3;
c=a+b;

printf("%d\n",c);
return 0;
}

analysis part

import sys



# This is not required if you've installed pycparser into

# your site-packages/ with setup.py

#

sys.path.insert(0, '..')



from pycparser import c_parser, c_ast, parse_file

from pycparser.portability import printme





# Parse one C source file with pycparser and dump the resulting AST.

#
# (No visitor is defined in this listing: the whole tree is printed, not

# just the function definitions.)



def show_func_defs(filename):
    """Parse `filename` with pycparser and print its abstract syntax tree.

    The file is run through the C preprocessor first (use_cpp=True), and the
    parsed tree is printed with its own show() method. Nothing is returned.
    """

    # Note that cpp is used. Provide a path to your own cpp or 

    # make sure one exists in PATH.

    #
    ast = parse_file(filename, use_cpp=True)    

    ast.show()





# Script entry point: analyse the file named by the first command-line
# argument, or the bundled sample 'c_files/test.c' when none is given.
if __name__ == "__main__":

    if len(sys.argv) > 1:

        filename  = sys.argv[1]

    else:

        filename = 'c_files/test.c'



    show_func_defs(filename)

output part

FileAST: 
  FuncDef: 
    Decl: main, [], []
      FuncDecl: 
        TypeDecl: main, []
          IdentifierType: ['int']
    Compound: 
      Assignment: =
        ID: a
        Constant: int, 1
      Assignment: =
        ID: b
        Constant: int, 3
      Assignment: =
        ID: c
        BinaryOp: +
          ID: a
          ID: b
      FuncCall: 
        ID: printf
        ExprList: 
          Constant: string, "%d\n"
          ID: c
      Return: 
        Constant: int, 0
main at ./c_files/test.c:3

Refs: C Parser (Front End) Introducing the Boost parser framework Spirit Parser Framework Lex and YACC primer/HOWTO Simple Top-Down Parsing in Python Parse::Eyapp::datagenerationtut @ Perl 2.2. Parse::Yapp Lex && Yacc Example

2010年8月17日星期二

Path delay @ graphviz Pt1

graphviz 提供了Flow Chart(GUI) 讓使用者能夠快速的知道Data的flow跟每個struct 的相關性,底下用XML Viewer @ SOC當範例. tool environments step1. install graphviz lib from Graph Visualization Software step2. install GraphViz module from cpan

#!/usr/bin/perl -w
#
# A example which represents some XML as a tree

use strict;
use lib '../lib';
use GraphViz::XML;

my $xml =
'<Report>
 <Rpt_Note>
   <ReportType>timing</ReportType>
 </Rpt_Note>
 <PathList>
    <Path>
       <Path_Node>
           <StartPoint>xxx.xxx.xxx</StartPoint>
           <EndPoint>ccc.ccc.ccc</EndPoint>
           <DelayType>Max</DelayType>
       </Path_Node>
       <Point>
           <PointType>NetOrPort</PointType>
           <Name>xxx.xxx.xxx</Name>
           <Trigger>Rising</Trigger>
           <IncDelay>0.000</IncDelay>
           <TotalDelay>9.623</TotalDelay>
        </Point>
        <Point>
           <PointType>NetOrPort</PointType>
           <Name>ccc.ccc.ccc</Name>
           <Trigger>Falling</Trigger>
           <IncDelay>0.100</IncDelay>
           <TotalDelay>9.723</TotalDelay>
        </Point>
    </Path>
 </PathList>
</Report>';


# Build the graph from the XML document held in $xml.
my $graph = GraphViz::XML->new($xml);

# Render the graph to the PNG file "xml.png".
$graph->as_png("xml.png");
# Alternative outputs for debugging (disabled; note the object is $graph):
#print $graph->as_text;
#print $graph->_as_debug;

pic XML spanning tree Refs:

XML Viewer @ SOC An Introduction to GraphViz

Graph Visualization Software

2010年8月15日星期日

Shortest Path Algorithm

pic 1 In the gate-level design of an SoC, timing checks are one of our major tasks. The timing libraries define tags and constraints such as the "worst", "typical", and "best" cases for the incremental delay of each gate (node), and the "setup time" / "hold time" for each existing timing path, where a path runs from one flip-flop through gates to another flip-flop. In pic 1 we can see that the output flip-flop has its own fan-in paths, and all of them must meet the setup and hold timing constraints; if not, the flip-flop captures the wrong data at the clock strobe. A setup-time failure means the data arrives too late, which is determined by the longest path; a hold-time failure means the data arrives too early, which is determined by the shortest path. So here we use the Shortest Path algorithm to find the shortest path in our case. How do we adjust the violating paths to meet the constraints? For a setup-timing violation, we usually add a latch to the violating path, because it can hold the data in each transaction. For example, if the path takes 2T and our timing constraint is 1T, the timing check fails. If we add a latch, it separates the path into two parts; here we assume the first part is 1.5T and the second part is 0.4T, so the total path time is (1.5+0.4) = 1.9T. Because the latch functions the same way as a flip-flop, the timing path is redefined with a 2T budget, so the 1.9T path meets the 2T constraint. For a hold-timing violation, add delay buffers.....

2010年8月13日星期五

Minimum Spanning Tree Algorithm

In SoC design, the clock source is a big problem, because it influences the data throughput rate and correctness. In a default SoC design, the routing tool places the clock source in one corner of the chip, but the clock paths must run from the root to the leaves so that the clock is supplied to the whole chip. How to keep the clock clean while lowering the hardware cost is our topic, and here we use the Minimum Spanning Tree algorithm to address this question. Pic 1 shows our clock tree case; we extend it to each cluster of clock leaves. If we define each cluster as having its own clock domain, such as C1 (533 MHz) and C2 (266 MHz), and each clock domain as having its own clock leaf nodes, we can use a Minimum Spanning Tree to find the minimum clock routing. That means we get the shortest clock paths from the clock root (PLL), so the metal lines get shorter and violations are reduced.

pic 1 In order to reduce power consumption, we can insert clock gates, built from latches, into the paths of the clock tree. Because a latch is a level-sensitive design, it only changes its data when the status changes, so it can filter out violations at the input and hold the data stable at the output stage.

Connected Components Algorithm

fig 1 Purpose: applying the Connected Components Algorithm to SoC design. Sample case: clock-domain issues and path extraction. Constraints: Step 1. If we define Nodes 1 ~ 4 as being in the same clock domain, and each node has its own data paths, which we call edges or vectors, we can easily read the INPUT graph in fig 1, e.g. Node 1 (input from Node 3, output to Node 2)... and so on. Step 2. If the connections and dependency conditions are already known, we can trace each node and find its loop path. If a loop exists, we can cluster the nodes of that loop into the same clock domain, and each data path can be sorted. Step 3. Once the clusters are found, we can use simple commands to extract our results faster and more easily. For example, an STA (static timing analysis) tool uses "report_timing -to -max_slack -max_path ...", so we can use it in this case: first, find the same clock domain in one cluster; second, find the data-path correlation within this cluster and report the results.... Otherwise you can write different clock domains into different clock-domain constraints. PS: the Boost library is an easy way to do this. Just define your graph conditions, edge paths and constraints, e.g.:

  // NOTE(review): fragment only — e, flag, G and ds are declared outside this excerpt.
  // Add an edge between vertices 0 and 1 to graph G and unpack the returned
  // pair into e and flag (presumably the edge descriptor and an "inserted"
  // boolean, per Boost.Graph's add_edge — confirm against the declarations).
  boost::tie(e,flag) = add_edge(0, 1, G);
  // Merge the sets containing 0 and 1 in ds (presumably a disjoint-set
  // structure tracking connected components — confirm).
  ds.union_set(0,1);

Refs: Connected Components Algorithm Connected Components incremental_components @Boost c++ lib

2010年8月12日星期四

Booth Algorithm

Booth on-line test Booth's Algorithm sample code booth.cpp


#include <iostream>
#include <string>


#define BITSIZE 5 

using namespace std;

/*
 * Booth: state and single-step operations for a Booth's-algorithm
 * multiplication demo.
 *
 * Numbers are mirrored as fixed-width binary strings of BITSIZE characters
 * produced by Int2Bin(). c_str is the combined register: Int2Bin(c) followed
 * by Int2Bin(b) (see SetBooth_c_str), i.e. 2*BITSIZE characters. The Pro_*
 * methods update both the integer c and the leading BITSIZE characters of
 * c_str so that the two stay in step.
 *
 * NOTE(review): CheckRange() accepts the 16-bit range, but Int2Bin() only
 * keeps the low BITSIZE bits, so values outside the BITSIZE-bit range are
 * silently truncated in the string form.
 */
class Booth{
private:
 int a;          // value added/subtracted by Pro_ADD_A_c_str / Pro_SUB_A_c_str
 string a_str;   // Int2Bin(a), set by SetBooth_a_str
 int b;          // second operand
 string b_str;   // Int2Bin(b), set by SetBooth_b_str
 int c;          // integer value of the leading half of c_str
 string c_str;   // combined register, see class comment
 int bit;        // stored by SetBooth_Bit; not read inside this class

 // Replace the leading BITSIZE characters of c_str with Int2Bin(value),
 // keeping whatever follows them. A c_str shorter than BITSIZE simply
 // contributes no tail (the original iterator arithmetic was undefined there).
 void StoreHighHalf(int value){
     string tail;
     if( c_str.size() > static_cast<string::size_type>(BITSIZE) ){
         tail = c_str.substr(BITSIZE);
     }
     c_str = Int2Bin(value) + tail;
 }

public :
 // Start from a defined state instead of indeterminate integers.
 Booth() : a(0), b(0), c(0), bit(0) {}

 // Returns 0 when v lies in [-32768, 32767], otherwise -1.
 int CheckRange(int v){ return ( -32768<=v && v<=32767 )? 0 : -1; }

 void SetBooth_a(int i){ a=i; }
 int  GetBooth_a(){ return a; }

 void   SetBooth_a_str(int v){ a_str = Int2Bin(v); }
 string GetBooth_a_str(){ return a_str; }

 void SetBooth_b(int i){ b=i; }
 int  GetBooth_b(){ return b; }

 void   SetBooth_b_str(int v){ b_str = Int2Bin(v); }
 string GetBooth_b_str(){ return b_str; }

 void SetBooth_c(int i){ c=i; }
 int  GetBooth_c(){ return c; }

 // Build the combined register string: high half from hi, low half from lo.
 void   SetBooth_c_str(int hi,int lo){ c_str = Int2Bin(hi)+Int2Bin(lo); }
 string GetBooth_c_str(){ return c_str; }

 // Stores v and returns it. The original declared an int return type but
 // returned nothing, which is undefined behavior if the result is used.
 int SetBooth_Bit(int v){ bit=v; return bit; }

 // First character of s as a one-character string; "" when s is empty.
 string GetBegin_str(const string &s){
     return s.empty() ? string() : string(1, s[0]);
 }

 // Last character of s as a one-character string; "" when s is empty.
 string GetEnd_str(const string &s){
     return s.empty() ? string() : string(1, s[s.size()-1]);
 }

 // Render the low BITSIZE bits of number, most significant bit first,
 // as a string of '0'/'1' characters.
 string Int2Bin(int number)
 {
     string bins(BITSIZE, '0');

     for( int i = 0; i < BITSIZE; i++ ){
         if( (number >> i) & 0x01 ){ bins[BITSIZE-1-i] = '1'; }
     }
     return bins;
 }

 // Shift the register string right by one position, duplicating the leading
 // character and dropping the last one; c is shifted right by one as well.
 void Pro_SHIFT_c_str(){
     if( !c_str.empty() ){
         c_str = string(1, c_str[0]) + c_str.substr(0, c_str.size()-1);
     }
     c = c >> 1;
 }

 // c += a, then refresh the leading half of c_str.
 void Pro_ADD_A_c_str(){
     c = c + a;
     StoreHighHalf(c);
 }

 // c -= a, then refresh the leading half of c_str.
 void Pro_SUB_A_c_str(){
     c = c - a;
     StoreHighHalf(c);
 }

};



int main(int argc,char *argv[]){

Booth *BoothPtr = new Booth;

int a =2;
int b =-5;

if( BoothPtr->CheckRange(a) != 0 ){ 

cout<<"Out of Data Range in '-2^15 ~ 2^15-1' @a"<<endl;
return -1;
}
BoothPtr->SetBooth_a(a);
BoothPtr->SetBooth_a_str(a);


if( BoothPtr->CheckRange(b) != 0 ){ 
cout<<"Out of Data Range in '-2^15 ~ 2^15-1' @b"<<endl;
return -1;
}

BoothPtr->SetBooth_b(b);
BoothPtr->SetBooth_b_str(b);

BoothPtr->SetBooth_c(0);
BoothPtr->SetBooth_c_str(0,b);

string b_str = BoothPtr->GetBegin_str( BoothPtr->GetBooth_a_str() );

cout << "============================================"<<endl;
cout << "Step0.\tInitial a,b..." <<endl;  
cout << " a::\t" << BoothPtr->GetBooth_a() <<"\t"<< BoothPtr->GetBooth_a_str()<<endl;
cout << "xb::\t" << BoothPtr->GetBooth_b() <<"\t"<< BoothPtr->GetBooth_b_str()<<endl;
cout << "--------------------------------------------"<<endl;
cout << " c::\t" << BoothPtr->GetBooth_c_str() <<" "<< b_str<<endl;
cout << "============================================"<<endl;
cout << endl;

string r_str;

//cout << BoothPtr->Pro_SHIFT_c_str("1101");

for(int i=0; i<BITSIZE; i++){
      
      r_str = BoothPtr->GetEnd_str( BoothPtr->GetBooth_c_str() );  
      r_str += b_str;

          b_str = BoothPtr->GetEnd_str( BoothPtr->GetBooth_c_str() );

         if( r_str == "00" ){ cout << "Step"<< i+1 <<".\tSHIFT"<<endl; BoothPtr->Pro_SHIFT_c_str();                           }
    else if( r_str == "01" ){ cout << "Step"<< i+1 <<".\tADD_A"<<endl; BoothPtr->Pro_ADD_A_c_str();  BoothPtr->Pro_SHIFT_c_str(); } 
    else if( r_str == "10" ){ cout << "Step"<< i+1 <<".\tSUB_A"<<endl; BoothPtr->Pro_SUB_A_c_str();  BoothPtr->Pro_SHIFT_c_str(); }
    else if( r_str == "11" ){ cout << "Step"<< i+1 <<".\tSHIFT"<<endl;BoothPtr->Pro_SHIFT_c_str();                               }
    else{ cout <<" Booth Internal Error ..." << endl; return -1; } 
   
cout << " a::\t" << BoothPtr->GetBooth_a() <<"\t"<< BoothPtr->GetBooth_a_str()<<endl;
cout << " b::\t" << BoothPtr->GetBooth_b() <<"\t"<< BoothPtr->GetBooth_b_str()<<endl;
cout << " c::\t" << BoothPtr->GetBooth_c() <<"\t"<< BoothPtr->GetBooth_c_str()<<endl; 
cout << "--------------------------------------------"<<endl;
cout << " c::\t" << BoothPtr->GetBooth_c_str() <<" "<< b_str<<endl;
cout << "============================================"<<endl;
cout << endl;


}

cout << " c::\t" << "Bin::"<<BoothPtr->GetBooth_c_str() <<" "<< "Dec::"<<a*b<<endl;

return 0;
}

Results: ============================================ Step0. Initial a,b... a:: 2 00010 xb:: -5 11011 -------------------------------------------- c:: 0000011011 0 ============================================ Step1. SUB_A a:: 2 00010 b:: -5 11011 c:: -1 1111101101 -------------------------------------------- c:: 1111101101 1 ============================================ .... ============================================ Step5. SHIFT a:: 2 00010 b:: -5 11011 c:: -1 1111110110 -------------------------------------------- c:: 1111110110 1 ============================================ c:: Bin::1111110110 Dec::-10 code download here

ACPI @ Linux kernel

"Power" is a big problem in any electronic device, and how to reduce power consumption is a good topic for us. On the hardware side, we often hear about power technologies such as "Power Gating", "Power Management", "DVFS"... The goal of all of them is to turn off unused hardware and keep the power consumption stable without frequent charging or discharging, because each charge or discharge wastes some power to change state; the dynamic power gets higher at that moment, so the temperature rises, and that can damage our electronic devices. From the power-saving point of view, the questions are when to turn off the devices and how to be more efficient without wasting heat and power. So on the software side, each power state is defined, and each power state has its own job. For example, G1 (Sleeping) is subdivided into the four states S1 through S4: S1: All processor caches are flushed, and the CPU(s) stop executing instructions. Power to the CPU(s) and RAM is maintained; devices that do not indicate they must remain on may be powered down. ... We can then use a power-state table and some predictions to determine the state transitions and when to change state..., which makes it easy to manage and turn off unused devices efficiently. Refs: SmartReflex™ Power and Performance Management Tech... DVFS emulator peak power power management 4 Linux power monitor part2 power monitor part1 Advanced Configuration and Power Interface

2010年8月11日星期三

XML Viewer @ SOC

XML Interface, 大家應該不陌生, 最常用在檔案的傳輸跟資料的鍊結. 定義好 XML Encoder/ Decoder的 Type,剩下就是token的欄位要分別代表什麼參數了. 其實在Debussy(Verdi)的 schematic view中, 也是用XML的語法來連接內部的Data Based. Path example

<?xml version="1.0" encoding="UTF-8"?>
<Report>
 <Rpt_Note>
   <ReportType>timing</ReportType>
 </Rpt_Note>
 <PathList>
    <Path>
       <Path_Node>
           <StartPoint>xxx.xxx.xxx</StartPoint>
           <EndPoint>ccc.ccc.ccc</EndPoint>
           <DelayType>Max</DelayType>
       </Path_Node>
       <Point>
           <PointType>NetOrPort</PointType>
           <Name>xxx.xxx.xxx</Name>
           <Trigger>Rising</Trigger>
           <IncDelay>0.000</IncDelay>
           <TotalDelay>9.623</TotalDelay>
        </Point>
        <Point>
           <PointType>NetOrPort</PointType>
           <Name>ccc.ccc.ccc</Name>
           <Trigger>Falling</Trigger>
           <IncDelay>0.100</IncDelay>
           <TotalDelay>9.723</TotalDelay>
        </Point>
    </Path>
 </PathList>
</Report>

Refs: XML Use XML to build ASIC or SoC design specifications XML Parser @ Perl

2010年8月10日星期二

Behavior Synthesizer tool @sister

Behavior Synthesizer @Sister 是open source的project, 用在 high level SystemC 2 RTL Verilog. 有興趣的人可以下載來玩玩. Step1. 前端用Lex && Yacc 當Parser,找出符合Grammar的Token. 如 Module Name(module *name), input(sc_in/sc_out), Method(sc_method/sc_thread...) , Process( void xxx() ) Step2. 把建立好的Tokens 存入 Node tree上, 再根據現有的constrain 做 scheduling 的動作, 調整每個 Node 的時間先後順序還有 Node 的相依性, 最後做 Nodes 的化簡.可減少硬體的cost. Step3. Translate 2 Verilog format

Refs: lex && yacc Graph theory

2010年8月6日星期五

SmartReflex™ Power and Performance Management Technologies

透過不同層級的Model來達到Power Saving的功效. Silicon IP: CMOS: 藉由改變Channel length,跟thermal voltage的方式讓CMOS能夠在很低的Voltage supply也能被driver. 或者透過 inverse VT 的方式阻斷VDD 到 GND 的short, 降低Leakage power 的消耗. Ref: Channel length modulation GATE: 用Power Gating / Clock Gating 的方式來Gate 掉不必要的switch, 減少Dynamic Power的消耗. Level Shift: 主要用在power supply VDD/VSS上, 藉由改變不同的電壓supply來調整Channel length的大小.做到快速/慢速的charge.來影響導通的速度. Ref: Dual-Supply Interface Level Shifter CMOS Logic ICs Soc Design: DVFS(dynamic voltage frequency scaling) 動態的調整 voltage / frequency , 利用最少的voltage / frequency 來達到最大的 performance, 且能符合 timing 的 constrain. Refs: Voltage and frequency scaling DVFS emulator Multi clock / power domain 把parting 過的結果做power / clock 的切割, 分成不同的 clock / power domain, 把相同的clock / power bound 在一起,可減少硬體控制跟routing 的複雜度. Software : Power management Refs:power management 4 Linux peak power power monitor part1 底下用 "Battery contain" 跟不同 "Task" 所消耗的power, 來模擬行動裝置的 Battery condition. battery.c



#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#define BATTERY_CONTAIN 5000 // battery capacity, in the same power units as BATTERY.remain
#define TEST_COT        3    // bound compared against the global COT by both emulator threads

/* Guards the battery state around each charge / consume step in the emulator threads. */
pthread_mutex_t count_mutex     = PTHREAD_MUTEX_INITIALIZER;
/* Not referenced by any code visible in this file. */
pthread_mutex_t condition_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Only referenced by commented-out pthread_cond_signal / pthread_cond_wait calls. */
pthread_cond_t  condition_cond  = PTHREAD_COND_INITIALIZER;


/*
 * Charge level classes returned by Check_BATTERY_STATUS().
 * The percentages below are fractions of BATTERY_CONTAIN as tested there.
 */
enum BATTERY_STATUS{
 BATTERY_FULL =0,   /* above 80% (including values above BATTERY_CONTAIN) */
 BATTERY_EMPTY =1,  /* above 0 and up to 20% */
 BATTERY_HALF =2,   /* above 40% and up to 60% */
 BATTERY_LOWER =3,  /* above 20% and up to 40% */
 BATTERY_OK =4,     /* above 60% and up to 80% */
 BATTERY_ERROR =5,  /* none of the ranges matched (remain is 0) */
};


/*
 * Task selectors stored in TASK.sel and switched on by Check_TASK_STATUS(),
 * which maps each one to a fixed on/sleep/off combination of the WORKList
 * devices. Note the values are not contiguous: TASK_OFF is 7.
 */
enum TASK_STATUS{
 TASK_VEDIO =0,
 TASK_GAME =1,
 TASK_PHONE =2,
 TASK_WALKMAN =3,
 TASK_IDLE =4,
 TASK_OFF =7,
};


/*
 * Per-device power cost table. Check_TASK_STATUS() sums one of the three
 * *_upw fields per device depending on the task; it indexes WORKList by
 * position (0 = TFT, 1 = KEYBOARD, 2 = WIFI, 3 = GSM), so the order of the
 * initializers matters.
 */
struct WORK_STATUS{
 char          *name;     /* device label */
 unsigned int  on_upw;    /* cost charged when the device is ON */
 unsigned int  slp_upw;   /* cost charged when the device is in SLEEP */
 unsigned int  off_upw;   /* cost charged when the device is OFF */
} WORKList [] = {
 { "TFT" ,100, 50, 10 },
 { "KEYBOARD" , 10,  5,  1 },
 { "WIFI" ,500,200, 10 },
 { "GSM" ,700,500, 30 },
};

/*
 * Battery model shared by both emulator threads (they both use
 * &BATTERYList[0]). The single entry starts with 100 units remaining.
 */
struct BATTERY{
 char          *name;    /* battery label */
 unsigned int  remain;   /* remaining charge; compared against BATTERY_CONTAIN */
} BATTERYList [] = {
  { "iBATTERY" , 100 },
};
 
/*
 * Task script walked in order by Emulator_TASK(); each entry pairs a label
 * with the TASK_STATUS selector passed to Check_TASK_STATUS().
 * TASK_IDLE appears twice on purpose or by accident — NOTE(review): confirm.
 */
struct TASK{
 char          *name;   /* task label */
 unsigned int  sel;     /* one of enum TASK_STATUS */
} TASKList [] = {
  { "TASK_OFF"    , TASK_OFF    },
  { "TASK_IDLE"   , TASK_IDLE   },
  { "TASK_VEDIO"  , TASK_VEDIO  },
  { "TASK_PHONE"  , TASK_PHONE  },
  { "TASK_IDLE"   , TASK_IDLE   },
  { "TASK_WALKMAN", TASK_WALKMAN},
};


/* Pass counter: incremented by Emulator_TASK, compared against TEST_COT by both emulator threads. */
int     COT =0;

 
void Set_WORK_STATUS(struct WORK_STATUS *Ptr,char *nm,unsigned int on,unsigned int slp,unsigned int off){
                     Ptr->name   = nm;
                     Ptr->on_upw = on;
                     Ptr->slp_upw= slp;
                     Ptr->off_upw= off;
}

void Set_BATTERY(struct BATTERY *Ptr,char *nm,unsigned int rman){
                     Ptr->name   = nm;
                     Ptr->remain = rman;
}

void Set_TASK(struct TASK *Ptr,char *nm,unsigned int sel){
                     Ptr->name = nm;
                     Ptr->sel  = sel;
}

/*
 * Classify the remaining charge of *Ptr relative to BATTERY_CONTAIN.
 *
 * Returns an enum BATTERY_STATUS value:
 *   > 80%  (including anything above capacity) -> BATTERY_FULL
 *   > 60%  -> BATTERY_OK
 *   > 40%  -> BATTERY_HALF
 *   > 20%  -> BATTERY_LOWER
 *   > 0    -> BATTERY_EMPTY
 *   0      -> BATTERY_ERROR
 * Each upper bound is inclusive; the thresholds are tested from the top
 * down, so every test only needs its lower bound.
 */
int Check_BATTERY_STATUS(struct BATTERY *Ptr){
    const unsigned int level = Ptr->remain;

    if( level > 0.8*BATTERY_CONTAIN ){ return BATTERY_FULL;  }
    if( level > 0.6*BATTERY_CONTAIN ){ return BATTERY_OK;    }
    if( level > 0.4*BATTERY_CONTAIN ){ return BATTERY_HALF;  }
    if( level > 0.2*BATTERY_CONTAIN ){ return BATTERY_LOWER; }
    if( level > 0 ){                   return BATTERY_EMPTY; }
    return BATTERY_ERROR;
}

/*
 * Add one charging step (50 units) to *Ptr unless Check_BATTERY_STATUS()
 * already reports BATTERY_FULL.
 */
void Charge_BATTERY(struct BATTERY *Ptr){
    if( Check_BATTERY_STATUS(Ptr) == BATTERY_FULL ){
        return;
    }
    Ptr->remain = Ptr->remain + 50;
}


/*
 * Try to run one step of task *Ttr on battery *Ptr.
 *
 * The task selector picks a fixed on/sleep/off state for each WORKList
 * device (0 = TFT, 1 = KEYBOARD, 2 = WIFI, 3 = GSM); the step cost is the
 * sum of the matching per-device costs.
 *
 * Returns 0 and subtracts the cost from Ptr->remain when the step is
 * allowed. Returns -1, leaving the battery untouched, when
 *   - the selector is not a known TASK_STATUS value,
 *   - the battery is BATTERY_EMPTY or BATTERY_ERROR, or
 *   - the cost is not below 90% of the remaining charge.
 *
 * The original guard read "status != EMPTY || status != ERROR", which is
 * always true, so empty batteries were never rejected; it also left one
 * path without a return value.
 */
int Check_TASK_STATUS(struct BATTERY *Ptr,struct TASK *Ttr){

    int battery_status = Check_BATTERY_STATUS(Ptr);
    unsigned int consume;

    switch(Ttr->sel){
        // VEDIO  : TFT(ON)    KEYBOARD(SLEEP)  WIFI(ON)      GSM(SLEEP)
        // GAME   : TFT(ON)    KEYBOARD(ON)     WIFI(SLEEP)   GSM(SLEEP)
        // PHONE  : TFT(SLEEP) KEYBOARD(SLEEP)  WIFI(SLEEP)   GSM(ON)
        // WALKMAN: TFT(SLEEP) KEYBOARD(ON)     WIFI(SLEEP)   GSM(SLEEP)
        // IDLE   : TFT(SLEEP) KEYBOARD(SLEEP)  WIFI(SLEEP)   GSM(SLEEP)
        // OFF    : TFT(OFF)   KEYBOARD(OFF)    WIFI(OFF)     GSM(OFF)
        case TASK_VEDIO   : consume = WORKList[0].on_upw  + WORKList[1].slp_upw + WORKList[2].on_upw  + WORKList[3].slp_upw; break;
        case TASK_GAME    : consume = WORKList[0].on_upw  + WORKList[1].on_upw  + WORKList[2].slp_upw + WORKList[3].slp_upw; break;
        case TASK_PHONE   : consume = WORKList[0].slp_upw + WORKList[1].slp_upw + WORKList[2].slp_upw + WORKList[3].on_upw;  break;
        case TASK_WALKMAN : consume = WORKList[0].slp_upw + WORKList[1].on_upw  + WORKList[2].slp_upw + WORKList[3].slp_upw; break;
        case TASK_IDLE    : consume = WORKList[0].slp_upw + WORKList[1].slp_upw + WORKList[2].slp_upw + WORKList[3].slp_upw; break;
        case TASK_OFF     : consume = WORKList[0].off_upw + WORKList[1].off_upw + WORKList[2].off_upw + WORKList[3].off_upw; break;
        default           : return -1;
    }

    if( battery_status == BATTERY_EMPTY || battery_status == BATTERY_ERROR ){
        return -1;
    }

    /* Keep a 10% reserve: the step must cost less than 90% of what is left. */
    if( 0.9*Ptr->remain > consume ){
        Ptr->remain -= consume;
        return 0;
    }
    return -1;
}

void *Emulator_BATTERY(void *t){
     int i;
     struct   BATTERY *BATTERYPtr     = &BATTERYList[0];
 
      while(COT <= TEST_COT){
         for(i=0; i<30; i++){
           pthread_mutex_lock(&count_mutex);
          if( Check_BATTERY_STATUS(BATTERYPtr) == BATTERY_ERROR ){ pthread_exit(NULL); }
           
           Charge_BATTERY(BATTERYPtr);
           printf("0::%d\n", BATTERYPtr->remain);
 
   //        pthread_cond_signal(&condition_cond);
           pthread_mutex_unlock(&count_mutex);
           sleep(1); 
          }
        sleep(20);
      }

    if( COT == TEST_COT ){ pthread_exit(NULL); }

}

void *Emulator_TASK(void *t){
     int      i,j,rst;
     struct   BATTERY     *BATTERYPtr     = &BATTERYList[0];
     struct   TASK        *TASKPtr;
     
        if( Check_BATTERY_STATUS(BATTERYPtr) == BATTERY_ERROR ){ pthread_exit(NULL); }  
        
       if(COT <= TEST_COT ){ 
          for(i=0; i<sizeof(TASKList)/sizeof(*TASKList); i++){
            TASKPtr = &TASKList[i];
            j=0;

            while(j<3){
             pthread_mutex_lock(&count_mutex);
 //            pthread_cond_wait(&condition_cond,&count_mutex); 
             rst = Check_TASK_STATUS(BATTERYPtr,TASKPtr);
             if( rst ==0 ){  printf("1::%d,%d\n", i,BATTERYPtr->remain );  }
             pthread_mutex_unlock(&count_mutex);
             
              if( rst==0 ){
                  j++;
                  sleep(0.5);
               } else {
                 printf("Out of Battery contain...Please turn off App\n");
                 j =0;
                 sleep(50);
               }
            }
            printf("App done ...\n");       
            sleep(1); 
         }
         COT++;
        }

    if( COT == TEST_COT ){ pthread_exit(NULL); }
}
 
/*
 * Start the charger and consumer threads and wait for both to finish.
 *
 * Returns 0 on success, EXIT_FAILURE if either thread cannot be created
 * (the original ignored pthread_create's result and would then join an
 * uninitialized thread handle).
 */
int main(int argc, char *argv[]){
    pthread_t thread_1, thread_2;
    int rc;
    (void)argc;
    (void)argv;

    rc = pthread_create(&thread_1, NULL, Emulator_BATTERY, NULL);
    if( rc != 0 ){
        fprintf(stderr, "pthread_create(Emulator_BATTERY) failed: %d\n", rc);
        return EXIT_FAILURE;
    }

    rc = pthread_create(&thread_2, NULL, Emulator_TASK, NULL);
    if( rc != 0 ){
        fprintf(stderr, "pthread_create(Emulator_TASK) failed: %d\n", rc);
        /* Returning from main ends the process, including the charger thread. */
        return EXIT_FAILURE;
    }

    pthread_join(thread_1, NULL);
    pthread_join(thread_2, NULL);

    return 0;
}

compile

gcc -pthread -o tt battery.c

code download here Refs : Second-Generation SmartReflex™ Power and Performance Management Technologies Address power management issues in mobiles

訂閱：文章 (Atom)

learning plus

2010年8月30日星期一

ARM Cortex Ax with NEON SIMD

2010年8月29日星期日

NetWork on Chip @c

2010年8月26日星期四

uBoot Case Study @ omap3

AMBA 4.0 AXI Bus Pt2

2010年8月24日星期二

AMBA 4.0 AXI Bus Pt1

2010年8月23日星期一

ARM vs X86

Sharkv2 2010_06~08 profit reports pt2

2010年8月20日星期五

Sharkv2 2010_06~08 profit reports pt1

2010年8月18日星期三

Lex & Yacc case study @ PLY

2010年8月17日星期二

Path delay @ graphviz Pt1

2010年8月15日星期日

Shortest Path Algorithm

2010年8月13日星期五

Minimum Spanning Tree Algorithm

Connected Components Algorithm

2010年8月12日星期四

Booth Algorithm

ACPI @ Linux kernel

2010年8月11日星期三

XML Viewer @ SOC

2010年8月10日星期二

Behavior Synthesizer tool @sister

2010年8月6日星期五

SmartReflex™ Power and Performance Management Technologies

copyright

2010年8月30日 星期一

2010年8月29日 星期日

2010年8月26日 星期四

2010年8月24日 星期二

2010年8月23日 星期一

2010年8月20日 星期五

2010年8月18日 星期三

2010年8月17日 星期二

2010年8月15日 星期日

2010年8月13日 星期五

2010年8月12日 星期四

2010年8月11日 星期三

2010年8月10日 星期二

2010年8月6日 星期五

2010年8月30日星期一

2010年8月29日星期日

2010年8月26日星期四

2010年8月24日星期二

2010年8月23日星期一

2010年8月20日星期五

2010年8月18日星期三

2010年8月17日星期二

2010年8月15日星期日

2010年8月13日星期五

2010年8月12日星期四

2010年8月11日星期三

2010年8月10日星期二

2010年8月6日星期五