Re: Questions on performance methods


naohiko shimizu (nshimizu@et.u-tokai.ac.jp)
Mon, 30 Nov 1998 11:54:35 +0900 (JST)


If the N is large (N > 10000), the code I and Goto wrote will use too much
memory traffic. Then we still have a chance to improve the
performance with blocking. Last night it was almost 3am in localtime
when I read the mail, and I had not enough time to code.

See the following, how to make the blocked code. (Still not debugged yet.)
(BTW, it is like a step by step tutorials for code tuning.)

In article <199811291821.DAA16533@mail.fa2.so-net.ne.jp>
pshimizu@fa2.so-net.ne.jp writes:

----------------------------------------------------------------------------
#define BLOCKINGSIZE 1000

void dsqrtiv(double *, double *, int);

double find_jastrow_sub(double *vtemp, double b, int count) {
  int i;
  double tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
  double vtmp[8];

    dsqrtiv(vtemp, vtemp, count);

/*
Unrolling and software pipelining the loop.
8 times unrolling is enough for the Scache pipeline.
*/
    i = 0;
    tmp0 = tmp1 = tmp2 = tmp3 =
    tmp4 = tmp5 = tmp6 = tmp7 = 1.0;
    vtmp[0] = vtemp[i];
    vtmp[1] = vtemp[i+1];
    vtmp[2] = vtemp[i+2];
    vtmp[3] = vtemp[i+3];
    vtmp[4] = vtemp[i+4];
    vtmp[5] = vtemp[i+5];
    vtmp[6] = vtemp[i+6];
    vtmp[7] = vtemp[i+7];
    for(i=0;i < count-8; i+=8){
          tmp0 *= (1.0 - b * vtmp[0]);
          tmp1 *= (1.0 - b * vtmp[1]);
          tmp2 *= (1.0 - b * vtmp[2]);
          tmp3 *= (1.0 - b * vtmp[3]);
          tmp4 *= (1.0 - b * vtmp[4]);
          tmp5 *= (1.0 - b * vtmp[5]);
          tmp6 *= (1.0 - b * vtmp[6]);
          tmp7 *= (1.0 - b * vtmp[7]);
          vtmp[0] = vtemp[i+8];
          vtmp[1] = vtemp[i+9];
          vtmp[2] = vtemp[i+10];
          vtmp[3] = vtemp[i+11];
          vtmp[4] = vtemp[i+12];
          vtmp[5] = vtemp[i+13];
          vtmp[6] = vtemp[i+14];
          vtmp[7] = vtemp[i+15];
    }
/*
The rest of the loop will be here.
*/
    for(i=0;i < count - i;i++)
          tmp0 *= (1.0 - b * vtmp[i]);
    return tmp0*tmp1*tmp2*tmp3*tmp4*tmp5*tmp6*tmp7;
  }

int find_jastrow(double *result,int j,struct coord *q,double r[3],double b){

  int i,vcnt;
  double tmp, tmp1, tmp2, tmp3;
  double r1, r2, r3;
  double bp2;
  double vtemp[BLOCKINGSIZE+16];

  r1 = r[0];
  r2 = r[1];
  r3 = r[2];
  
  tmp = 1.0;

  vcnt = 0;
  bp2 = b*b;

  if(b>0){
/*
 To reduce inner loop conditional branch, I separate the loop.
 This may not be required if the compiler is smart enough.
 When b**2 < rij, we should return immediately, without calling sqrt.
 Then I add one conditional branch *_*.
*/
    i = 0;

    tmp1 = q[i].x[0]-r1;
    tmp2 = q[i].x[1]-r2;
    tmp3 = q[i].x[2]-r3;

    for(i=0;i < j;i++){

        tmp1 = tmp1*tmp1;
        tmp2 = tmp2*tmp2;
        tmp3 = tmp3*tmp3;

        if((vtemp[vcnt++] = tmp1 + tmp2 + tmp3) > bp2) {
             *result = 0.0;
             return 0;
        }
/*
You may want to prefetch the q[i+1].x here, to reduce the stall
*/
        if(vcnt >= BLOCKINGSIZE) {
             tmp *= find_jastrow_sub(vtemp, b, vcnt);
             vcnt = 0;
        }
        tmp1 = q[i+1].x[0]-r1;
        tmp2 = q[i+1].x[1]-r2;
        tmp3 = q[i+1].x[2]-r3;

      }

    tmp1 = q[j+1].x[0]-r1;
    tmp2 = q[j+1].x[1]-r2;
    tmp3 = q[j+1].x[2]-r3;

    for(i=j+1;i < N;i++){

        tmp1 = tmp1*tmp1;
        tmp2 = tmp2*tmp2;
        tmp3 = tmp3*tmp3;

        if((vtemp[vcnt++] = tmp1 + tmp2 + tmp3) > bp2) {
             *result = 0.0;
             return 0;
        }
/*
You may want to prefetch the q[i+1].x here, to reduce the stall
*/
        if(vcnt >= BLOCKINGSIZE) {
             tmp *= find_jastrow_sub(vtemp, b, vcnt);
             vcnt = 0;
        }
        tmp1 = q[i+1].x[0]-r1;
        tmp2 = q[i+1].x[1]-r2;
        tmp3 = q[i+1].x[2]-r3;

      }
    tmp *= find_jastrow_sub(vtemp, b, vcnt);
  }
  *result = tmp;
  return 1;
}

Naohiko Shimizu
Dept. Communication Engr./Univ. TOKAI.
1117 Kitakaname Hiratsuka 259-12 Japan
TEL.+81-463-58-1211(ext. 4084) FAX.+81-463-58-8320
<URL http://shimizu-lab.et.u-tokai.ac.jp/>



This archive was generated by hypermail 2.0b3 on Mon Nov 30 1998 - 22:10:34 EST