subroutine p_z_matmul(A,B,C,Nblock,alfa,beta)
complex(8),dimension(:,:),intent(in) :: A ![N,K]
complex(8),dimension(:,:),intent(in) :: B ![K,M]
complex(8),dimension(size(A,1),size(B,2)),intent(inout) :: C ![N,M]
integer :: Nblock
real(8),optional :: alfa,beta
real(8) :: alfa_,beta_
integer :: Nb
integer :: N,K,M
integer :: Nrows,Kcols,Krows,Mcols
!
integer :: i,j,lda,info
integer,external :: numroc,indxG2L,indxG2P,indxL2G
!
integer,dimension(9) :: descAloc,descBloc,descCloc,descC
complex(8),dimension(:,:),allocatable :: Aloc,Bloc,Cloc
integer :: rankX,rankY
integer :: sendR,sendC,Nipiv
integer :: Nrow,Ncol
integer :: myi,myj,unit
real(8) :: t_stop,t_start
logical :: master
!
! C = alfa*A*B + beta*C
!
alfa_ = dcmplx(1d0,0d0) ;if(present(alfa))alfa_=alfa
beta_ = dcmplx(0d0,0d0) ;if(present(beta))beta_=beta
!
!
N = size(A,1)
K = size(A,2) !==size(B,1)
M = size(B,2)
if(any(shape(B)/=[K,M]))stop "p_d_matmul error: B has illegal shape"
if(any(shape(C)/=[N,M]))stop "p_d_matmul error: C has illegal shape"
!
!INIT SCALAPACK TREATMENT:
!
!< Get coordinate of the processes
call blacs_gridinfo( p_context, p_Nx, p_Ny, rankX, rankY)
!
master = (rankX==0).AND.(rankY==0)
if(rankX<0.AND.rankY<0)goto 1010
!
Nb = Nblock
!
Nrows = numroc(N, Nb, rankX, 0, p_Nx)
Kcols = numroc(K, Nb, rankY, 0, p_Ny)
Krows = numroc(K, Nb, rankX, 0, p_Nx)
Mcols = numroc(M, Nb, rankY, 0, p_Ny)
!
if(master)then
unit = free_unit()
open(unit,file="p_dgemm.info")
write(unit,"(A20,I8,A5,I8)")"Grid=",p_Nx,"x",p_Ny
write(unit,"(A20,I8,A5,I8)")"Nrows x Kcols=",Nrows,"x",Kcols
write(unit,"(A20,I8,A5,I8)")"Krows x Mcols=",Krows,"x",Mcols
endif
!
!< allocate local distributed A
!
call descinit( descAloc, N, K, Nb, Nb, 0, 0, p_context, Nrows, info )
call descinit( descBloc, K, M, Nb, Nb, 0, 0, p_context, Krows, info )
call descinit( descCloc, N, M, Nb, Nb, 0, 0, p_context, Nrows, info )
!
!< Distribute A,B; allocate C
allocate(Aloc(Nrows,Kcols));Aloc=zero
allocate(Bloc(Krows,Mcols));Bloc=zero
allocate(Cloc(Nrows,Mcols));Cloc=zero
call Distribute_BLACS(A,Aloc,descAloc,unit)
call Distribute_BLACS(B,Bloc,descBloc,unit)
!
if(master)call cpu_time(t_stop)
if(master)write(unit,"(A20,F21.12)")"Time Distribute A,B:",t_stop-t_start
!
if(master)call cpu_time(t_start)
call PZGEMM( 'No transpose', 'No transpose', N, M, K, &
alfa_, &
Aloc, 1, 1, descAloc, &
Bloc, 1, 1, descBloc, &
beta_, &
Cloc, 1, 1, descCloc )
!
if(master)call cpu_time(t_stop)
if(master)write(unit,"(A20,F21.12)")"Time matmul C_loc:",t_stop-t_start
!
C=dcmplx(0d0,0d0)
call descinit( descC, N, M, Nb, Nb, 0, 0, p_context, Nrows, info )
call Gather_BLACS(Cloc,C,descC,unit)
!
if(master)close(unit)
1010 continue
return
!
end subroutine p_z_matmul