naohiko shimizu (nshimizu@et.u-tokai.ac.jp)
Mon, 30 Nov 1998 11:54:35 +0900 (JST)
If the N is large (N > 10000), the code I and Goto wrote will use too much
memory traffic. Then we still have a chance to improve the
performance with blocking. Last night it was almost 3am in localtime
when I read the mail, and I had not enough time to code.
See the following, how to make the blocked code. (Still not debugged yet.)
(BTW, it is like a step by step tutorials for code tuning.)
In article <199811291821.DAA16533@mail.fa2.so-net.ne.jp>
pshimizu@fa2.so-net.ne.jp writes:
----------------------------------------------------------------------------
#define BLOCKINGSIZE 1000
void dsqrtiv(double *, double *, int);
double find_jastrow_sub(double *vtemp, double b, int count) {
int i;
double tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
double vtmp[8];
dsqrtiv(vtemp, vtemp, count);
/*
Unrolling and software pipelining the loop.
8 times unrolling is enough for the Scache pipeline.
*/
i = 0;
tmp0 = tmp1 = tmp2 = tmp3 =
tmp4 = tmp5 = tmp6 = tmp7 = 1.0;
vtmp[0] = vtemp[i];
vtmp[1] = vtemp[i+1];
vtmp[2] = vtemp[i+2];
vtmp[3] = vtemp[i+3];
vtmp[4] = vtemp[i+4];
vtmp[5] = vtemp[i+5];
vtmp[6] = vtemp[i+6];
vtmp[7] = vtemp[i+7];
for(i=0;i < count-8; i+=8){
tmp0 *= (1.0 - b * vtmp[0]);
tmp1 *= (1.0 - b * vtmp[1]);
tmp2 *= (1.0 - b * vtmp[2]);
tmp3 *= (1.0 - b * vtmp[3]);
tmp4 *= (1.0 - b * vtmp[4]);
tmp5 *= (1.0 - b * vtmp[5]);
tmp6 *= (1.0 - b * vtmp[6]);
tmp7 *= (1.0 - b * vtmp[7]);
vtmp[0] = vtemp[i+8];
vtmp[1] = vtemp[i+9];
vtmp[2] = vtemp[i+10];
vtmp[3] = vtemp[i+11];
vtmp[4] = vtemp[i+12];
vtmp[5] = vtemp[i+13];
vtmp[6] = vtemp[i+14];
vtmp[7] = vtemp[i+15];
}
/*
The rest of the loop will be here.
*/
for(i=0;i < count - i;i++)
tmp0 *= (1.0 - b * vtmp[i]);
return tmp0*tmp1*tmp2*tmp3*tmp4*tmp5*tmp6*tmp7;
}
int find_jastrow(double *result,int j,struct coord *q,double r[3],double b){
int i,vcnt;
double tmp, tmp1, tmp2, tmp3;
double r1, r2, r3;
double bp2;
double vtemp[BLOCKINGSIZE+16];
r1 = r[0];
r2 = r[1];
r3 = r[2];
tmp = 1.0;
vcnt = 0;
bp2 = b*b;
if(b>0){
/*
To reduce inner loop conditional branch, I separate the loop.
This may not be required if the compiler is smart enough.
When b**2 < rij, we should return immediately, without calling sqrt.
Then I add one conditional branch *_*.
*/
i = 0;
tmp1 = q[i].x[0]-r1;
tmp2 = q[i].x[1]-r2;
tmp3 = q[i].x[2]-r3;
for(i=0;i < j;i++){
tmp1 = tmp1*tmp1;
tmp2 = tmp2*tmp2;
tmp3 = tmp3*tmp3;
if((vtemp[vcnt++] = tmp1 + tmp2 + tmp3) > bp2) {
*result = 0.0;
return 0;
}
/*
You may want to prefetch the q[i+1].x here, to reduce the stall
*/
if(vcnt >= BLOCKINGSIZE) {
tmp *= find_jastrow_sub(vtemp, b, vcnt);
vcnt = 0;
}
tmp1 = q[i+1].x[0]-r1;
tmp2 = q[i+1].x[1]-r2;
tmp3 = q[i+1].x[2]-r3;
}
tmp1 = q[j+1].x[0]-r1;
tmp2 = q[j+1].x[1]-r2;
tmp3 = q[j+1].x[2]-r3;
for(i=j+1;i < N;i++){
tmp1 = tmp1*tmp1;
tmp2 = tmp2*tmp2;
tmp3 = tmp3*tmp3;
if((vtemp[vcnt++] = tmp1 + tmp2 + tmp3) > bp2) {
*result = 0.0;
return 0;
}
/*
You may want to prefetch the q[i+1].x here, to reduce the stall
*/
if(vcnt >= BLOCKINGSIZE) {
tmp *= find_jastrow_sub(vtemp, b, vcnt);
vcnt = 0;
}
tmp1 = q[i+1].x[0]-r1;
tmp2 = q[i+1].x[1]-r2;
tmp3 = q[i+1].x[2]-r3;
}
tmp *= find_jastrow_sub(vtemp, b, vcnt);
}
*result = tmp;
return 1;
}
Naohiko Shimizu
Dept. Communication Engr./Univ. TOKAI.
1117 Kitakaname Hiratsuka 259-12 Japan
TEL.+81-463-58-1211(ext. 4084) FAX.+81-463-58-8320
<URL http://shimizu-lab.et.u-tokai.ac.jp/>
This archive was generated by hypermail 2.0b3 on Mon Nov 30 1998 - 22:10:34 EST