25 #include "wrapperMPI.h"    27 void zgemm_(
char *TRANSA, 
char *TRANSB, 
int *M, 
int *N, 
int *K, 
double complex *ALPHA, 
double complex *matJL, 
int *LDA, 
double complex *arrayz, 
int *LDB, 
double complex *BETA, 
double complex *arrayx, 
int *LDC);
    38   double complex *tmp_v0 ,
    39   double complex *tmp_v1 ,
    40   double complex *tmp_v2 ,
    41   double complex *tmp_v3 
    52   int M, N, K, LDA, LDB, LDC;
    53   double complex ALPHA, BETA;  
    54   long unsigned int i_max;
    55   long unsigned int j, k, ell, iloop;
    56   long unsigned int i1, i2;
    57   long unsigned int iomp;
    58   long unsigned int ell4, ell5, ell6, m0, Ipart1;
    59   long unsigned int mi, mj, mri, mrj, mrk, mrl;
    61   long unsigned int ellrl, ellrk, ellrj, ellri, elli1, elli2, ellj1, ellj2;
    62   long unsigned int iSS1, iSS2, iSSL1, iSSL2;
    63   double complex **vecJ;
    64   double complex **matJ, **matJ2;
    65   double complex *matJL;
    67   double complex **matB;
    68   double complex *arrayz;
    69   double complex *arrayx;
    70   double complex *arrayw;
    71   long unsigned int ishift1, ishift2, ishift3, ishift4, ishift5, pivot_flag, num_J_star;
    72   long unsigned int pow4, pow5, pow41, pow51;  
    75   i_max = 
X->Check.idim_max;
    86   c_malloc2(vecJ, 3, 3); 
    87   c_malloc2(matJ, 4, 4); 
    88   c_malloc2(matJ2, 4, 4); 
    89   c_malloc2(matB, 2, 2); 
    90   c_malloc1(matJL, (64*64)); 
    91   c_malloc1(matI, (64*64)); 
    99   for(iloop=0; iloop < 
X->Boost.R0; iloop++){
   102     for(j=iloop*
X->Boost.num_pivot; j < (iloop+1)*
X->Boost.num_pivot; j++){
   104       num_J_star = (
long unsigned int)
X->Boost.list_6spin_star[j][0]; 
   105       ishift1    = (
long unsigned int)
X->Boost.list_6spin_star[j][1]; 
   106       ishift2    = (
long unsigned int)
X->Boost.list_6spin_star[j][2]; 
   107       ishift3    = (
long unsigned int)
X->Boost.list_6spin_star[j][3]; 
   108       ishift4    = (
long unsigned int)
X->Boost.list_6spin_star[j][4]; 
   109       ishift5    = (
long unsigned int)
X->Boost.list_6spin_star[j][5]; 
   110       pivot_flag = (
long unsigned int)
X->Boost.list_6spin_star[j][6]; 
   114       pow4 = (
int)pow(2.0,ishift1+ishift2+ishift3+ishift4);
   115       pow5 = (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5);
   119       pow41= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+1);
   120       pow51= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+1);
   122       for(k=0; k < (64*64); k++){
   123         matJL[k] = 0.0 + 0.0*I;
   124         matI[k]  = 0.0 + 0.0*I;
   126       for(k=0; k < 64; k++){
   130       for(ell=0; ell < num_J_star; ell++){
   131         mi   = (
long unsigned int)
X->Boost.list_6spin_pair[j][0][ell]; 
   132         mj   = (
long unsigned int)
X->Boost.list_6spin_pair[j][1][ell]; 
   133         mri  = (
long unsigned int)
X->Boost.list_6spin_pair[j][2][ell]; 
   134         mrj  = (
long unsigned int)
X->Boost.list_6spin_pair[j][3][ell]; 
   135         mrk  = (
long unsigned int)
X->Boost.list_6spin_pair[j][4][ell]; 
   136         mrl  = (
long unsigned int)
X->Boost.list_6spin_pair[j][5][ell]; 
   137         indj = 
X->Boost.list_6spin_pair[j][6][ell]; 
   138         for(i1 = 0; i1 < 3; i1++){
   139           for(i2 = 0; i2 < 3; i2++){
   140             vecJ[i1][i2] = 
X->Boost.arrayJ[(indj-1)][i1][i2];
   144         matJ[0][0] = vecJ[2][2];
   146         matJ[0][1] = vecJ[0][0]-vecJ[1][1]-I*vecJ[0][1]-I*vecJ[1][0];
   148         matJ[0][2] = vecJ[2][0]-I*vecJ[2][1];
   150         matJ[0][3] = vecJ[0][2]-I*vecJ[1][2];
   152         matJ[1][0] = vecJ[0][0]-vecJ[1][1]+I*vecJ[0][1]+I*vecJ[1][0];
   154         matJ[1][1] = vecJ[2][2];
   156         matJ[1][2] =(-1.0)*vecJ[0][2]-I*vecJ[1][2];
   158         matJ[1][3] =(-1.0)*vecJ[2][0]-I*vecJ[2][1];
   160         matJ[2][0] = vecJ[2][0]+I*vecJ[2][1];
   162         matJ[2][1] =(-1.0)*vecJ[0][2]+I*vecJ[1][2];
   164         matJ[2][2] =(-1.0)*vecJ[2][2];
   166         matJ[2][3] = vecJ[0][0]+vecJ[1][1]+I*vecJ[0][1]-I*vecJ[1][0];
   168         matJ[3][0] = vecJ[0][2]+I*vecJ[1][2];
   170         matJ[3][1] =(-1.0)*vecJ[2][0]+I*vecJ[2][1];
   172         matJ[3][2] = vecJ[0][0]+vecJ[1][1]-I*vecJ[0][1]+I*vecJ[1][0];
   174         matJ[3][3] =(-1.0)*vecJ[2][2];
   176         matJ2[3][3] = matJ[0][0]; 
   177         matJ2[3][0] = matJ[0][1]; 
   178         matJ2[3][1] = matJ[0][2]; 
   179         matJ2[3][2] = matJ[0][3]; 
   180         matJ2[0][3] = matJ[1][0]; 
   181         matJ2[0][0] = matJ[1][1]; 
   182         matJ2[0][1] = matJ[1][2]; 
   183         matJ2[0][2] = matJ[1][3]; 
   184         matJ2[1][3] = matJ[2][0]; 
   185         matJ2[1][0] = matJ[2][1];
   186         matJ2[1][1] = matJ[2][2]; 
   187         matJ2[1][2] = matJ[2][3]; 
   188         matJ2[2][3] = matJ[3][0]; 
   189         matJ2[2][0] = matJ[3][1]; 
   190         matJ2[2][1] = matJ[3][2]; 
   191         matJ2[2][2] = matJ[3][3]; 
   193         for(ellri=0; ellri<2; ellri++){
   194         for(ellrj=0; ellrj<2; ellrj++){
   195         for(ellrk=0; ellrk<2; ellrk++){
   196         for(ellrl=0; ellrl<2; ellrl++){
   197           for(elli1=0; elli1<2; elli1++){
   198           for(ellj1=0; ellj1<2; ellj1++){
   199           for(elli2=0; elli2<2; elli2++){
   200           for(ellj2=0; ellj2<2; ellj2++){
   202             iSSL1 = elli1*(int)pow(2,mi) + ellj1*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl);
   203             iSSL2 = elli2*(int)pow(2,mi) + ellj2*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl);
   204             iSS1  = elli1 + 2*ellj1;
   205             iSS2  = elli2 + 2*ellj2;
   206             matJL[iSSL1+64*iSSL2] += matJ2[iSS1][iSS2];
   221         matB[0][0] = + 
X->Boost.vecB[2]; 
   222         matB[1][1] = - 
X->Boost.vecB[2]; 
   225         matB[0][1] = - 
X->Boost.vecB[0] - I*
X->Boost.vecB[1]; 
   226         matB[1][0] = - 
X->Boost.vecB[0] + I*
X->Boost.vecB[1]; 
   227         for(ellri=0; ellri<2; ellri++){
   228         for(ellrj=0; ellrj<2; ellrj++){
   229         for(ellrk=0; ellrk<2; ellrk++){
   230         for(ellrl=0; ellrl<2; ellrl++){
   231         for(ellj1=0; ellj1<2; ellj1++){
   232           for(elli1=0; elli1<2; elli1++){
   233           for(elli2=0; elli2<2; elli2++){
   234             for(ellj2=0; ellj2<
X->Boost.ishift_nspin; ellj2++){
   235               iSSL1 = elli1*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6));
   236               iSSL2 = elli2*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6));
   237               matJL[iSSL1+64*iSSL2] += matB[elli1][elli2];
   249       iomp=i_max/(int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+2);
   251       #pragma omp parallel default(none) private(arrayx,arrayz,arrayw,ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \   252       shared(matJL,matI,iomp,i_max,myrank,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,tmp_v0,tmp_v1,tmp_v3)   255             c_malloc1(arrayx, (64*((
int)pow(2.0,ishift4+ishift5-1))));
   256             c_malloc1(arrayz, (64*((
int)pow(2.0,ishift4+ishift5-1))));
   257             c_malloc1(arrayw, (64*((
int)pow(2.0,ishift4+ishift5-1))));
   260         for(ell6 = 0; ell6 < iomp; ell6++){
   262           for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){
   263         for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){
   264           for(m0 = 0; m0 < 16; m0++){   
   265         arrayz[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4          +pow41*ell5+Ipart1)];
   266         arrayz[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4     +pow41*ell5+Ipart1)];
   267         arrayz[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow5     +pow41*ell5+Ipart1)];
   268         arrayz[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
   269         tmp_v3[(1 + m0+16*ell4          +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4          +pow41*ell5+Ipart1)];
   270         tmp_v3[(1 + m0+16*ell4+pow4     +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4     +pow41*ell5+Ipart1)];
   271         tmp_v3[(1 + m0+16*ell4+pow5     +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow5     +pow41*ell5+Ipart1)];
   272         tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
   273         arrayx[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4          +pow41*ell5+Ipart1)];
   274         arrayx[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4     +pow41*ell5+Ipart1)];
   275         arrayx[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow5     +pow41*ell5+Ipart1)];
   276         arrayx[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
   282           for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){
   283         for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){
   284           for(m0 = 0; m0 < 16; m0++){
   285         arrayz[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4          +pow41*ell5+pow51+Ipart1)];
   286         arrayz[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4     +pow41*ell5+pow51+Ipart1)];
   287         arrayz[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow5     +pow41*ell5+pow51+Ipart1)];
   288         arrayz[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
   289         tmp_v3[(1 + m0+16*ell4          +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4          +pow41*ell5+pow51+Ipart1)];
   290         tmp_v3[(1 + m0+16*ell4+pow4     +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4     +pow41*ell5+pow51+Ipart1)];
   291         tmp_v3[(1 + m0+16*ell4+pow5     +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow5     +pow41*ell5+pow51+Ipart1)];
   292         tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
   293         arrayx[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4          +pow41*ell5+pow51+Ipart1)];
   294         arrayx[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4     +pow41*ell5+pow51+Ipart1)];
   295         arrayx[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow5     +pow41*ell5+pow51+Ipart1)];
   296         arrayx[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
   305           N = (int)pow(2.0, ishift4+ishift5-1);
   313             zgemm_(&TRANSA,&TRANSB,&M,&N,&K,&ALPHA,matJL,&LDA,arrayz,&LDB,&BETA,arrayx,&LDC);
   333           for(ell5 = 0; ell5 < (int)pow(2.0,ishift5-1); ell5++){
   334           for(ell4 = 0; ell4 < (int)pow(2.0,ishift4-1); ell4++){
   335             for(m0 = 0; m0 < 16; m0++){
   336               tmp_v1[(1 + m0+16*ell4          +pow41*ell5+Ipart1)]       = arrayx[(0 + m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
   337               tmp_v1[(1 + m0+16*ell4+pow4     +pow41*ell5+Ipart1)]       = arrayx[(16+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
   338               tmp_v1[(1 + m0+16*ell4+pow5     +pow41*ell5+Ipart1)]       = arrayx[(32+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
   339               tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]       = arrayx[(48+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
   343           for(ell5 = 0; ell5 < (int)pow(2.0,ishift5-1); ell5++){
   344           for(ell4 = 0; ell4 < (int)pow(2.0,ishift4-1); ell4++){
   345             for(m0 = 0; m0 < 16; m0++){
   346               tmp_v1[(1 + m0+16*ell4          +pow41*ell5+pow51+Ipart1)] = arrayx[(0 + m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
   347               tmp_v1[(1 + m0+16*ell4+pow4     +pow41*ell5+pow51+Ipart1)] = arrayx[(16+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
   348               tmp_v1[(1 + m0+16*ell4+pow5     +pow41*ell5+pow51+Ipart1)] = arrayx[(32+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
   349               tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = arrayx[(48+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
   355       c_free1(arrayz, (64*((
int)pow(2.0,ishift4+ishift5-1))) );
   356       c_free1(arrayx, (64*((
int)pow(2.0,ishift4+ishift5-1))) );
   357       c_free1(arrayw, (64*((
int)pow(2.0,ishift4+ishift5-1))) );
   362         iomp=i_max/(int)pow(2.0,
X->Boost.ishift_nspin);
   363         #pragma omp parallel for default(none) private(ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \   364         firstprivate(iomp) shared(i_max,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,X,tmp_v0,tmp_v1)   365         for(ell5 = 0; ell5 < iomp; ell5++ ){
   366           for(ell4 = 0; ell4 < (int)pow(2.0,
X->Boost.ishift_nspin); ell4++){
   367             tmp_v0[(1 + ell5+(i_max/(int)pow(2.0,
X->Boost.ishift_nspin))*ell4)] = tmp_v1[(1 + ell4+((int)pow(2.0,
X->Boost.ishift_nspin))*ell5)];
   370         iomp=i_max/(int)pow(2.0,
X->Boost.ishift_nspin);
   371         #pragma omp parallel for default(none) private(ell4,ell5) \   372         firstprivate(iomp) shared(i_max,X,tmp_v1,tmp_v3)   373         for(ell5 = 0; ell5 < iomp; ell5++ ){
   374           for(ell4 = 0; ell4 < (int)pow(2.0,
X->Boost.ishift_nspin); ell4++){
   375             tmp_v1[(1 + ell5+(i_max/(int)pow(2.0,
X->Boost.ishift_nspin))*ell4)] = tmp_v3[(1 + ell4+((int)pow(2.0,
X->Boost.ishift_nspin))*ell5)];
   380         #pragma omp parallel for default(none) private(ell4) \   381         shared(i_max,tmp_v0,tmp_v1,tmp_v3)   382         for(ell4 = 0; ell4 < i_max; ell4++ ){
   383           tmp_v0[1 + ell4] = tmp_v1[1 + ell4];
   384           tmp_v1[1 + ell4] = tmp_v3[1 + ell4];
   394     MPI_Alltoall(&tmp_v1[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,&tmp_v3[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
   395     MPI_Alltoall(&tmp_v0[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,&tmp_v2[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
   398     iomp=(int)pow(2.0,
X->Boost.W0)/
nproc;
   399     #pragma omp parallel for default(none) private(ell4,ell5,ell6) \   400     firstprivate(iomp) shared(i_max,X,nproc,tmp_v0,tmp_v1,tmp_v2,tmp_v3)   402     for(ell4 = 0; ell4 < iomp; ell4++ ){
   403       for(ell5 = 0; ell5 < 
nproc; ell5++ ){
   404         for(ell6 = 0; ell6 < (int)(i_max/(
int)pow(2.0,
X->Boost.W0)); ell6++ ){
   405           tmp_v1[(1 + ell6+ell5*i_max/(int)pow(2.0,
X->Boost.W0)+ell4*i_max/((int)pow(2.0,
X->Boost.W0)/
nproc))] = tmp_v3[(1 + ell6+ell4*i_max/(int)pow(2.0,
X->Boost.W0)+ell5*i_max/
nproc)];
   406           tmp_v0[(1 + ell6+ell5*i_max/(int)pow(2.0,
X->Boost.W0)+ell4*i_max/((int)pow(2.0,
X->Boost.W0)/
nproc))] = tmp_v2[(1 + ell6+ell4*i_max/(int)pow(2.0,
X->Boost.W0)+ell5*i_max/
nproc)];
   427   c_free2(matJ2, 4, 4);
   429   c_free1(matJL, (64*64));
   430   c_free1(matI, (64*64));
 void child_general_int_spin_MPIBoost(struct BindStruct *X, double complex *tmp_v0, double complex *tmp_v1, double complex *tmp_v2, double complex *tmp_v3)
int nproc
Number of processors, defined in InitializeMPI() 
void zgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double complex *ALPHA, double complex *matJL, int *LDA, double complex *arrayz, int *LDB, double complex *BETA, double complex *arrayx, int *LDC)