!{\src2tex{textfont=tt}}
!!****f* ABINIT/projbd
!!
!! NAME
!! projbd
!!
!! FUNCTION
!! Project out of vector "direc" the bands contained in "cg".
!! if useoverlap==0
!!  New direc=direc-$sum_{j} { <cg_{j}|direc>.|cg_{j}> }$
!! if useoverlap==1 (use of overlap matrix S)
!!  New direc=direc-$sum_{j/=i} { <cg_{j}|S|direc>.|cg_{j}> }$
!!
!! COPYRIGHT
!! Copyright (C) 1998-2007 ABINIT group (XG, MT)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt.
!!
!! INPUTS
!!  cg(2,mcg)=wavefunction coefficients for ALL bands
!!  iband0=which particular band we are interested in
!!         ("i" in the above formula)
!!         Can be set to -1 to sum over all bands...
!!  icg=shift to be given to the location of the data in cg
!!  iscg=shift to be given to the location of the data in cg
!!  istwf_k=option parameter that describes the storage of wfs
!!  mcg=maximum size of second dimension of cg
!!  mpi_enreg=informations about MPI parallelization
!!  mscg=maximum size of second dimension of scg
!!  nband=number of bands
!!  npw=number of planewaves
!!  ortalg= choice of algorithm for the projection
!!   Note : negative values are used outside to select whether
!!   the perpendicular projection is done prior preconditioning.
!!  print= if 1, print intermediate dot products
!!  scg(2,mscg*useoverlap)=<G|S|band> for ALL bands,
!!                        where S is an overlap matrix
!!  tim_projbd=timing code of the calling subroutine(can be set to 0 if not attributed)
!!  useoverlap=describe the overlap of wavefunctions:
!!               0: no overlap (S=Identity_matrix)
!!               1: wavefunctions are overlapping
!!
!! OUTPUT
!!  scprod(2,nband)=scalar_product
!!   if useoverlap==1
!!    scalar_product_i=$<cg_{j}|S|direc_{i}>$
!!   if useoverlap==0
!!    scalar_product_i=$<cg_{j}|direc_{i}>$
!!
!! SIDE EFFECTS
!!  direc(2,npw)= input: vector to be orthogonalised with respect to cg (and S)
!!                output: vector that has been orthogonalized wrt cg (and S)
!!
!! NOTES
!!  XG030513: MPIWF Might have to be recoded for efficient paralellism
!!
!! PARENTS
!!      cgwf,cgwf3
!!
!! CHILDREN
!!      timab,wrtout,xcomm_init,xsum_mpi
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine projbd(cg,direc,iband0,icg,iscg,istwf_k,mcg,mpi_enreg,mscg,nband,&
&                 npw,ortalg,print,scg,scprod,tim_projbd,useoverlap)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_01manage_mpi
 use interfaces_lib01hidempi
#else
 use defs_xfuncmpi
#endif
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!This type is defined in defs_mpi
!scalars
 integer,intent(in) :: iband0,icg,iscg,istwf_k,mcg,mscg,nband,npw,ortalg,print
 integer,intent(in) :: tim_projbd,useoverlap
 type(MPI_type),intent(inout) :: mpi_enreg
!arrays
 real(dp),intent(in) :: cg(2,mcg),scg(2,mscg*useoverlap)
 real(dp),intent(inout) :: direc(2,npw)
 real(dp),intent(out) :: scprod(2,nband)

!Local variables-------------------------------
!scalars
 integer :: iband,iband2,iblock,ierr,index,index2,index3,index4,ipw,ipw1,ishft
 integer :: nbandm,old_paral_level,spaceComm
 real(dp) :: ai,ai2,ai3,ai4,ar,ar2,ar3,ar4,cg_im,cg_im2,cg_im3,cg_im4,cg_re
 real(dp) :: cg_re2,cg_re3,cg_re4,direc_im,direc_re,scg_im,scg_im2,scg_im3
 real(dp) :: scg_im4,scg_re,scg_re2,scg_re3,scg_re4
 character(len=500) :: message
!arrays
 real(dp) :: buffer2(2),tsec(2)
 real(dp),allocatable :: atab(:)

! *************************************************************************
!

! DEBUG
!write(6,*)' projbd : enter '
!stop
! ENDDEBUG
 old_paral_level=mpi_enreg%paral_level
 mpi_enreg%paral_level=3
 call timab(210+tim_projbd,1,tsec)

 ishft=icg;if (useoverlap==1) ishft=iscg

!Here the common coding
 if(ortalg==0 .or. ortalg==1 .or. ortalg==-1 )then

  nbandm=nband


  if(istwf_k==1)then

   allocate(atab(2*nbandm));atab=zero
   do iband=1,nbandm
    ar=zero ; ai=zero
    iband2=2*iband-1
    index=npw*(iband-1)+ishft

    if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(ipw) REDUCTION(+:ai,ar) &
!$OMP&SHARED(scg,direc,index,npw)
     do ipw=1,npw
      ar=ar+scg(1,index+ipw)*direc(1,ipw)+scg(2,index+ipw)*direc(2,ipw)
      ai=ai-scg(2,index+ipw)*direc(1,ipw)+scg(1,index+ipw)*direc(2,ipw)
     end do
!$OMP END PARALLEL DO
    else
!$OMP PARALLEL DO PRIVATE(ipw) REDUCTION(+:ai,ar) &
!$OMP&SHARED(cg,direc,index,npw)
     do ipw=1,npw
      ar=ar+cg(1,index+ipw)*direc(1,ipw)+cg(2,index+ipw)*direc(2,ipw)
      ai=ai-cg(2,index+ipw)*direc(1,ipw)+cg(1,index+ipw)*direc(2,ipw)
     end do
!$OMP END PARALLEL DO
    end if
    atab(iband2)=ar;atab(iband2+1)=ai
   end do  ! Loop on iband

!XG030513 : MPIWF reduction on the ar,ai is needed here
   if (mpi_enreg%paral_compil_fft==1) then
    call xcomm_init(mpi_enreg,spaceComm)
    call timab(48,1,tsec)
    call xsum_mpi(atab,spaceComm,ierr)
    call timab(48,2,tsec)
   end if
   scprod(:,1:nbandm)=reshape(atab(1:2*nbandm),(/2,nbandm/))

   do iband=1,nbandm
    if ((useoverlap==1).and.(iband==iband0)) cycle
    index=npw*(iband-1)+icg
    ar=scprod(1,iband);ai=scprod(2,iband)

!$OMP PARALLEL DO PRIVATE(ipw,cg_re,cg_im) &
!$OMP&SHARED(ar,ai,cg,direc,index,npw)
    do ipw=1,npw
     cg_re=cg(1,index+ipw) ; cg_im=cg(2,index+ipw)
     direc(1,ipw)=direc(1,ipw)-ar*cg_re+ai*cg_im
     direc(2,ipw)=direc(2,ipw)-ar*cg_im-ai*cg_re
    end do
!$OMP END PARALLEL DO

    if(print==1)then
     write(message,'(a,i3,2f14.6)') &
&     'projbd : called from cgwf ; iband,ar,ai=',iband,ar,ai
     call wrtout(06,message,'PERS')
    end if

   end do  ! Loop on iband
   deallocate(atab)

  else if(istwf_k>=2)then

   allocate(atab(nbandm));atab=zero
   do iband=1,nbandm
    index=npw*(iband-1)+ishft

    if(istwf_k==2 .and. mpi_enreg%me_g0==1)then
     if (useoverlap==1) then
      ar=half*scg(1,index+1)*direc(1,1)
     else
      ar=half*cg(1,index+1)*direc(1,1)
     end if
     ipw1=2
    else
     ar=zero ; ipw1=1
    end if

    if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(ipw) REDUCTION(+:ar) &
!$OMP&SHARED(scg,direc,index,ipw1,npw)
     do ipw=ipw1,npw
      ar=ar+scg(1,index+ipw)*direc(1,ipw)+scg(2,index+ipw)*direc(2,ipw)
     end do
!$OMP END PARALLEL DO
    else
!$OMP PARALLEL DO PRIVATE(ipw) REDUCTION(+:ar) &
!$OMP&SHARED(cg,direc,index,ipw1,npw)
     do ipw=ipw1,npw
      ar=ar+cg(1,index+ipw)*direc(1,ipw)+cg(2,index+ipw)*direc(2,ipw)
     end do
!$OMP END PARALLEL DO
    end if
    atab(iband)=two*ar
   end do  ! Loop on iband


!XG030513 : MPIWF reduction on ar is needed here
   if (mpi_enreg%paral_compil_fft==1) then
    call xcomm_init(mpi_enreg,spaceComm)
    call timab(48,1,tsec)
    call xsum_mpi(atab,spaceComm,ierr)
    call timab(48,2,tsec)
   end if
   scprod(1,1:nbandm)=atab(1:nbandm)
   scprod(2,1:nbandm)=zero

   do iband=1,nbandm
    if ((useoverlap==1).and.(iband==iband0)) cycle
    index=npw*(iband-1)+icg
    ar=scprod(1,iband)

!$OMP PARALLEL DO PRIVATE(ipw) &
!$OMP&SHARED(ar,cg,direc,index,npw)
    do ipw=1,npw
     direc(1,ipw)=direc(1,ipw)-ar*cg(1,index+ipw)
     direc(2,ipw)=direc(2,ipw)-ar*cg(2,index+ipw)
    end do
!$OMP END PARALLEL DO

    if(print==1)then
     write(message,'(a,i3,f14.6)') &
&     'projbd : called from cgwf ; iband,ar=',iband,ar
     call wrtout(06,message,'PERS')
    end if

   end do  ! Loop on iband
   deallocate(atab)
  end if  ! Test on istwf_k

!Here better use of the registers
 else

  if (ortalg==2 .or. ortalg==-2)then

   iblock=2;nbandm=(nband/iblock)*iblock

   if(nband>=iblock)then

    if(istwf_k==1)then

     allocate(atab(2*nbandm));atab=zero
     do iband=1,nbandm,iblock
      ar=zero  ; ai=zero
      ar2=zero ; ai2=zero
      iband2=2*iband-1
      index=npw*(iband-1)+ishft
      index2=npw*iband+ishft

      if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(scg_re,scg_re2,scg_im,scg_im2) &
!$OMP&PRIVATE(direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ai,ai2,ar,ar2) &
!$OMP&SHARED(scg,direc,index,index2,npw)
       do ipw=1,npw
        direc_re=direc(1,ipw)     ; direc_im=direc(2,ipw)
        scg_re =scg(1,index+ipw)  ; scg_im =scg(2,index+ipw)
        scg_re2=scg(1,index2+ipw) ; scg_im2=scg(2,index2+ipw)
        ar=ar+scg_re*direc_re+scg_im*direc_im
        ai=ai-scg_im*direc_re+scg_re*direc_im
        ar2=ar2+scg_re2*direc_re+scg_im2*direc_im
        ai2=ai2-scg_im2*direc_re+scg_re2*direc_im
       end do
!$OMP END PARALLEL DO
      else
!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_im,cg_im2) &
!$OMP&PRIVATE(direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ai,ai2,ar,ar2) &
!$OMP&SHARED(cg,direc,index,index2,npw)
       do ipw=1,npw
        direc_re=direc(1,ipw)   ; direc_im=direc(2,ipw)
        cg_re=cg(1,index+ipw)   ; cg_im=cg(2,index+ipw)
        cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
        ar=ar+cg_re*direc_re+cg_im*direc_im
        ai=ai-cg_im*direc_re+cg_re*direc_im
        ar2=ar2+cg_re2*direc_re+cg_im2*direc_im
        ai2=ai2-cg_im2*direc_re+cg_re2*direc_im
       end do
!$OMP END PARALLEL DO
      end if
      atab(iband2  )=ar ;atab(iband2+1)=ai
      atab(iband2+2)=ar2;atab(iband2+3)=ai2
     end do  ! Loop on iband

!XG030513 : MPIWF reduction on the ar,ar2,ai,ai2 is needed here
   if (mpi_enreg%paral_compil_fft==1) then
      call xcomm_init(mpi_enreg,spaceComm)
      call timab(48,1,tsec)
      call xsum_mpi(atab,spaceComm,ierr)
      call timab(48,2,tsec)
     end if
     scprod(:,1:nbandm)=reshape(atab(1:2*nbandm),(/2,nbandm/))

     do iband=1,nbandm,iblock
      index=npw*(iband-1)+icg
      index2=npw*iband+icg
      ar =scprod(1,iband)  ;ai =scprod(2,iband)
      ar2=scprod(1,iband+1);ai2=scprod(2,iband+1)
      if ((useoverlap==1).and.(iband==iband0)) then
       ar=zero;ai=zero
      end if
      if ((useoverlap==1).and.(iband+1==iband0)) then
       ar2=zero;ai2=zero
      end if

!$OMP PARALLEL DO PRIVATE(ipw,cg_re,cg_re2,cg_im,cg_im2) &
!$OMP&SHARED(ar,ai,ar2,ai2,cg,direc,index,index2,npw)
      do ipw=1,npw
       cg_re =cg(1,index +ipw) ; cg_im =cg(2,index +ipw)
       cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
       direc(1,ipw)=direc(1,ipw)-ar*cg_re+ai*cg_im-ar2*cg_re2+ai2*cg_im2
       direc(2,ipw)=direc(2,ipw)-ar*cg_im-ai*cg_re-ar2*cg_im2-ai2*cg_re2
      end do
!$OMP END PARALLEL DO
     end do  ! Loop on iband
     deallocate(atab)

    else if(istwf_k>=2)then

     allocate(atab(nbandm));atab=zero
     do iband=1,nbandm,iblock
      index=npw*(iband-1)+ishft
      index2=npw*iband+ishft

      if(istwf_k==2 .and. mpi_enreg%me_g0==1)then
       if (useoverlap==1) then
        ar =half*scg(1,index +1)*direc(1,1)
        ar2=half*scg(1,index2+1)*direc(1,1)
       else
        ar =half*cg(1,index +1)*direc(1,1)
        ar2=half*cg(1,index2+1)*direc(1,1)
       end if
       ipw1=2
      else
       ar=zero ; ar2=zero ; ipw1=1
      end if

      if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(scg_re,scg_re2,scg_im,scg_im2) &
!$OMP&PRIVATE(direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ar,ar2) &
!$OMP&SHARED(scg,direc,index,index2,ipw1,npw)
      do ipw=ipw1,npw
        direc_re=direc(1,ipw)     ; direc_im=direc(2,ipw)
        scg_re =scg(1,index +ipw) ; scg_im =scg(2,index +ipw)
        scg_re2=scg(1,index2+ipw) ; scg_im2=scg(2,index2+ipw)
        ar =ar +scg_re *direc_re+scg_im *direc_im
        ar2=ar2+scg_re2*direc_re+scg_im2*direc_im
       end do
!$OMP END PARALLEL DO
      else
!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_im,cg_im2) &
!$OMP&PRIVATE(direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ar,ar2) &
!$OMP&SHARED(cg,direc,index,index2,ipw1,npw)
      do ipw=ipw1,npw
        direc_re=direc(1,ipw)   ; direc_im=direc(2,ipw)
        cg_re=cg(1,index+ipw)   ; cg_im=cg(2,index+ipw)
        cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
        ar =ar+cg_re *direc_re +cg_im *direc_im
        ar2=ar2+cg_re2*direc_re+cg_im2*direc_im
       end do
!$OMP END PARALLEL DO
      end if
      atab(iband  )=two*ar
      atab(iband+1)=two*ar2
     end do

!XG030513 : MPIWF reduction on the ar,ar2 is needed here
     if (mpi_enreg%paral_compil_fft==1) then
      call xcomm_init(mpi_enreg,spaceComm)
      call timab(48,1,tsec)
      call xsum_mpi(atab,spaceComm,ierr)
      call timab(48,2,tsec)
     end if
     scprod(1,1:nbandm)=atab(1:nbandm)
     scprod(2,1:nbandm)=zero

     do iband=1,nbandm,iblock
      index=npw*(iband-1)+icg
      index2=npw*iband+icg
      ar =scprod(1,iband)
      ar2=scprod(1,iband+1)
      if ((useoverlap==1).and.(iband  ==iband0)) ar =zero
      if ((useoverlap==1).and.(iband+1==iband0)) ar2=zero

!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_im,cg_im2) &
!$OMP&SHARED(ar,ar2,cg,direc,index,index2,npw)
      do ipw=1,npw
       cg_re =cg(1,index +ipw) ; cg_im =cg(2,index +ipw)
       cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
       direc(1,ipw)=direc(1,ipw)-ar*cg_re-ar2*cg_re2
       direc(2,ipw)=direc(2,ipw)-ar*cg_im-ar2*cg_im2
      end do
!$OMP END PARALLEL DO
     end do  ! Loop on iband
     deallocate(atab)
    end if  ! Test on istwf_k
   end if  ! Test on nband

  else if (ortalg==3 .or. ortalg==-3)then

   iblock=3;nbandm=(nband/iblock)*iblock

   if(nband>=iblock)then

    if(istwf_k==1)then

     allocate(atab(2*nbandm));atab=zero
     do iband=1,nbandm,iblock
      ar=zero  ; ai=zero
      ar2=zero ; ai2=zero
      ar3=zero ; ai3=zero
      iband2=2*iband-1
      index=npw*(iband-1)+ishft
      index2=npw*iband+ishft
      index3=npw*(iband+1)+ishft

      if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(scg_re,scg_re2,scg_re3,scg_im,scg_im2) &
!$OMP&PRIVATE(scg_im3,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ai,ai2,ai3,ar,ar2,ar3) &
!$OMP&SHARED(scg,direc,index,index2,index3,npw)
       do ipw=1,npw
        direc_re=direc(1,ipw)     ; direc_im=direc(2,ipw)
        scg_re =scg(1,index +ipw) ; scg_im =scg(2,index +ipw)
        scg_re2=scg(1,index2+ipw) ; scg_im2=scg(2,index2+ipw)
        scg_re3=scg(1,index3+ipw) ; scg_im3=scg(2,index3+ipw)
        ar=ar+scg_re*direc_re+scg_im*direc_im
        ai=ai-scg_im*direc_re+scg_re*direc_im
        ar2=ar2+scg_re2*direc_re+scg_im2*direc_im
        ai2=ai2-scg_im2*direc_re+scg_re2*direc_im
        ar3=ar3+scg_re3*direc_re+scg_im3*direc_im
        ai3=ai3-scg_im3*direc_re+scg_re3*direc_im
       end do
!$OMP END PARALLEL DO
      else
!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_re3,cg_im,cg_im2) &
!$OMP&PRIVATE(cg_im3,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ai,ai2,ai3,ar,ar2,ar3) &
!$OMP&SHARED(cg,direc,index,index2,index3,npw)
       do ipw=1,npw
        direc_re=direc(1,ipw)   ; direc_im=direc(2,ipw)
        cg_re=cg(1,index+ipw)   ; cg_im=cg(2,index+ipw)
        cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
        cg_re3=cg(1,index3+ipw) ; cg_im3=cg(2,index3+ipw)
        ar=ar+cg_re*direc_re+cg_im*direc_im
        ai=ai-cg_im*direc_re+cg_re*direc_im
        ar2=ar2+cg_re2*direc_re+cg_im2*direc_im
        ai2=ai2-cg_im2*direc_re+cg_re2*direc_im
        ar3=ar3+cg_re3*direc_re+cg_im3*direc_im
        ai3=ai3-cg_im3*direc_re+cg_re3*direc_im
       end do
!$OMP END PARALLEL DO
      end if
      atab(iband2  )=ar ;atab(iband2+1)=ai
      atab(iband2+2)=ar2;atab(iband2+3)=ai2
      atab(iband2+4)=ar3;atab(iband2+5)=ai3
     end do  ! Loop on iband

!XG030513 : MPIWF reduction on the ar,ar2,ar3,ai,ai2,ar3 is needed here
     if (mpi_enreg%paral_compil_fft==1) then
      call xcomm_init(mpi_enreg,spaceComm)
      call timab(48,1,tsec)
      call xsum_mpi(atab,spaceComm,ierr)
      call timab(48,2,tsec)
     end if
     scprod(:,1:nbandm)=reshape(atab(1:2*nbandm),(/2,nbandm/))

     do iband=1,nbandm,iblock
      index=npw*(iband-1)+icg
      index2=npw*iband+icg
      index3=npw*(iband+1)+icg
      ar =scprod(1,iband)  ;ai =scprod(2,iband)
      ar2=scprod(1,iband+1);ai2=scprod(2,iband+1)
      ar3=scprod(1,iband+2);ai3=scprod(2,iband+2)
      if ((useoverlap==1).and.(iband==iband0)) then
       ar=zero;ai=zero
      end if
      if ((useoverlap==1).and.(iband+1==iband0)) then
       ar2=zero;ai2=zero
      end if
      if ((useoverlap==1).and.(iband+2==iband0)) then
       ar3=zero;ai3=zero
      end if

!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_re3,cg_im,cg_im2) &
!$OMP&PRIVATE(cg_im3,ipw) &
!$OMP&SHARED(ar,ai,ar2,ai2,ar3,ai3,cg,direc,index,index2,index3,npw)
      do ipw=1,npw
       cg_re =cg(1,index +ipw) ; cg_im =cg(2,index +ipw)
       cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
       cg_re3=cg(1,index3+ipw) ; cg_im3=cg(2,index3+ipw)
       direc(1,ipw)=direc(1,ipw)-ar *cg_re +ai *cg_im-ar2*cg_re2+ai2*cg_im2&
&                               -ar3*cg_re3+ai3*cg_im3
       direc(2,ipw)=direc(2,ipw)-ar *cg_im -ai *cg_re-ar2*cg_im2-ai2*cg_re2&
&                               -ar3*cg_im3-ai3*cg_re3
      end do
!$OMP END PARALLEL DO
     end do  ! Loop on iband
     deallocate(atab)

    else if(istwf_k>=2)then

     allocate(atab(nbandm));atab=zero
     do iband=1,nbandm,iblock
      index=npw*(iband-1)+ishft
      index2=npw*iband+ishft
      index3=npw*(iband+1)+ishft

      if(istwf_k==2 .and. mpi_enreg%me_g0==1)then
       if (useoverlap==1) then
        ar =half*scg(1,index +1)*direc(1,1)
        ar2=half*scg(1,index2+1)*direc(1,1)
        ar3=half*scg(1,index3+1)*direc(1,1)
       else
        ar =half*cg(1,index +1)*direc(1,1)
        ar2=half*cg(1,index2+1)*direc(1,1)
        ar3=half*cg(1,index3+1)*direc(1,1)
       end if
       ipw1=2
      else
       ar=zero ; ar2=zero ; ar3=zero ; ipw1=1
      end if

      if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(scg_re,scg_re2,scg_re3,scg_im,scg_im2) &
!$OMP&PRIVATE(scg_im3,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ar,ar2,ar3) &
!$OMP&SHARED(scg,direc,index,index2,index3,npw)
       do ipw=ipw1,npw
        direc_re=direc(1,ipw)     ; direc_im=direc(2,ipw)
        scg_re=scg(1,index+ipw)   ; scg_im=scg(2,index+ipw)
        scg_re2=scg(1,index2+ipw) ; scg_im2=scg(2,index2+ipw)
        scg_re3=scg(1,index3+ipw) ; scg_im3=scg(2,index3+ipw)
        ar =ar +scg_re *direc_re+scg_im *direc_im
        ar2=ar2+scg_re2*direc_re+scg_im2*direc_im
        ar3=ar3+scg_re3*direc_re+scg_im3*direc_im
       end do
!$OMP END PARALLEL DO
      else
!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_re3,cg_im,cg_im2) &
!$OMP&PRIVATE(cg_im3,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ar,ar2,ar3) &
!$OMP&SHARED(cg,direc,index,index2,index3,npw)
       do ipw=ipw1,npw
        direc_re=direc(1,ipw)   ; direc_im=direc(2,ipw)
        cg_re=cg(1,index+ipw)   ; cg_im=cg(2,index+ipw)
        cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
        cg_re3=cg(1,index3+ipw) ; cg_im3=cg(2,index3+ipw)
        ar=ar+cg_re*direc_re+cg_im*direc_im
        ar2=ar2+cg_re2*direc_re+cg_im2*direc_im
        ar3=ar3+cg_re3*direc_re+cg_im3*direc_im
       end do
!$OMP END PARALLEL DO
      end if
      atab(iband  )=two*ar
      atab(iband+1)=two*ar2
      atab(iband+2)=two*ar3
     end do

!XG030513 : MPIWF reduction on the ar,ar2,a3 is needed here
     if (mpi_enreg%paral_compil_fft==1) then
      call xcomm_init(mpi_enreg,spaceComm)
      call timab(48,1,tsec)
      call xsum_mpi(atab,spaceComm,ierr)
      call timab(48,2,tsec)
     end if
     scprod(1,1:nbandm)=atab(1:nbandm)
     scprod(2,1:nbandm)=zero

     do iband=1,nbandm,iblock
      index=npw*(iband-1)+icg
      index2=npw*iband+icg
      index3=npw*(iband+1)+icg
      ar =scprod(1,iband)
      ar2=scprod(1,iband+1)
      ar3=scprod(1,iband+2)
      if ((useoverlap==1).and.(iband  ==iband0)) ar =zero
      if ((useoverlap==1).and.(iband+1==iband0)) ar2=zero
      if ((useoverlap==1).and.(iband+2==iband0)) ar3=zero

!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_re3,cg_im,cg_im2) &
!$OMP&PRIVATE(cg_im3,direc_re,direc_im,ipw) &
!$OMP&SHARED(ar,ar2,ar3,cg,direc,index,index2,index3,npw)
      do ipw=1,npw
       cg_re =cg(1,index +ipw) ; cg_im =cg(2,index +ipw)
       cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
       cg_re3=cg(1,index3+ipw) ; cg_im3=cg(2,index3+ipw)
       direc(1,ipw)=direc(1,ipw)-ar*cg_re-ar2*cg_re2-ar3*cg_re3
       direc(2,ipw)=direc(2,ipw)-ar*cg_im-ar2*cg_im2-ar3*cg_im3
      end do
!$OMP END PARALLEL DO
     end do  ! Loop on iband
     deallocate(atab)
    end if  ! Test on istwf_k
   end if  ! Test on nband

  else if (ortalg==4 .or. ortalg==-4)then

   iblock=4;nbandm=(nband/iblock)*iblock

   if(nband>=iblock)then

    if(istwf_k==1)then

     allocate(atab(2*nbandm));atab=zero
     do iband=1,nbandm,iblock
      ar=zero  ; ai=zero
      ar2=zero ; ai2=zero
      ar3=zero ; ai3=zero
      ar4=zero ; ai4=zero
      iband2=2*iband-1
      index=npw*(iband-1)+ishft
      index2=npw*iband+ishft
      index3=npw*(iband+1)+ishft
      index4=npw*(iband+2)+ishft

      if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(scg_re,scg_re2,scg_re3,scg_re4,scg_im) &
!$OMP&PRIVATE(scg_im2,scg_im3,scg_im4,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ai,ai2,ai3,ai4,ar,ar2,ar3,ar4) &
!$OMP&SHARED(scg,direc) &
!$OMP&SHARED(index,index2,index3,index4,npw)
       do ipw=1,npw
        direc_re=direc(1,ipw)     ; direc_im=direc(2,ipw)
        scg_re=scg(1,index+ipw)   ; scg_im=scg(2,index+ipw)
        scg_re2=scg(1,index2+ipw) ; scg_im2=scg(2,index2+ipw)
        scg_re3=scg(1,index3+ipw) ; scg_im3=scg(2,index3+ipw)
        scg_re4=scg(1,index4+ipw) ; scg_im4=scg(2,index4+ipw)
        ar=ar+scg_re*direc_re+scg_im*direc_im
        ai=ai-scg_im*direc_re+scg_re*direc_im
        ar2=ar2+scg_re2*direc_re+scg_im2*direc_im
        ai2=ai2-scg_im2*direc_re+scg_re2*direc_im
        ar3=ar3+scg_re3*direc_re+scg_im3*direc_im
        ai3=ai3-scg_im3*direc_re+scg_re3*direc_im
        ar4=ar4+scg_re4*direc_re+scg_im4*direc_im
        ai4=ai4-scg_im4*direc_re+scg_re4*direc_im
       end do
!$OMP END PARALLEL DO
      else
!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_re3,cg_re4,cg_im) &
!$OMP&PRIVATE(cg_im2,cg_im3,cg_im4,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ai,ai2,ai3,ai4,ar,ar2,ar3,ar4) &
!$OMP&SHARED(cg,direc) &
!$OMP&SHARED(index,index2,index3,index4,npw)
       do ipw=1,npw
        direc_re=direc(1,ipw)   ; direc_im=direc(2,ipw)
        cg_re =cg(1,index +ipw) ; cg_im =cg(2,index +ipw)
        cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
        cg_re3=cg(1,index3+ipw) ; cg_im3=cg(2,index3+ipw)
        cg_re4=cg(1,index4+ipw) ; cg_im4=cg(2,index4+ipw)
        ar=ar+cg_re*direc_re+cg_im*direc_im
        ai=ai-cg_im*direc_re+cg_re*direc_im
        ar2=ar2+cg_re2*direc_re+cg_im2*direc_im
        ai2=ai2-cg_im2*direc_re+cg_re2*direc_im
        ar3=ar3+cg_re3*direc_re+cg_im3*direc_im
        ai3=ai3-cg_im3*direc_re+cg_re3*direc_im
        ar4=ar4+cg_re4*direc_re+cg_im4*direc_im
        ai4=ai4-cg_im4*direc_re+cg_re4*direc_im
       end do
!$OMP END PARALLEL DO
      end if
      atab(iband2  )=ar ;atab(iband2+1)=ai
      atab(iband2+2)=ar2;atab(iband2+3)=ai2
      atab(iband2+4)=ar3;atab(iband2+5)=ai3
      atab(iband2+6)=ar4;atab(iband2+7)=ai4
     end do

!XG030513 : MPIWF reduction on the ar,ar2,ar3,ar4,ai,ai2,ai3,ai4 is needed here
     if (mpi_enreg%paral_compil_fft==1) then
      call xcomm_init(mpi_enreg,spaceComm)
      call timab(48,1,tsec)
      call xsum_mpi(atab,spaceComm,ierr)
      call timab(48,2,tsec)
     end if
     scprod(:,1:nbandm)=reshape(atab(1:2*nbandm),(/2,nbandm/))

     do iband=1,nbandm,iblock
      index=npw*(iband-1)+icg
      index2=npw*iband+icg
      index3=npw*(iband+1)+icg
      index4=npw*(iband+2)+icg
      ar =scprod(1,iband)  ;ai =scprod(2,iband)
      ar2=scprod(1,iband+1);ai2=scprod(2,iband+1)
      ar3=scprod(1,iband+2);ai3=scprod(2,iband+2)
      ar4=scprod(1,iband+3);ai4=scprod(2,iband+3)
      if ((useoverlap==1).and.(iband==iband0)) then
       ar=zero;ai=zero
      end if
      if ((useoverlap==1).and.(iband+1==iband0)) then
       ar2=zero;ai2=zero
      end if
      if ((useoverlap==1).and.(iband+2==iband0)) then
       ar3=zero;ai3=zero
      end if
      if ((useoverlap==1).and.(iband+3==iband0)) then
       ar4=zero;ai4=zero
      end if

!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_re3,cg_re4,cg_im) &
!$OMP&PRIVATE(cg_im2,cg_im3,cg_im4,ipw) &
!$OMP&SHARED(ar,ai,ar2,ai2,ar3,ai3,ar4,ai4,cg,direc) &
!$OMP&SHARED(index,index2,index3,index4,npw)
      do ipw=1,npw
       cg_re =cg(1,index +ipw) ; cg_im =cg(2,index +ipw)
       cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
       cg_re3=cg(1,index3+ipw) ; cg_im3=cg(2,index3+ipw)
       cg_re4=cg(1,index4+ipw) ; cg_im4=cg(2,index4+ipw)
       direc(1,ipw)=direc(1,ipw)-ar *cg_re +ai *cg_im -ar2*cg_re2+ai2*cg_im2&
&                               -ar3*cg_re3+ai3*cg_im3-ar4*cg_re4+ai4*cg_im4
       direc(2,ipw)=direc(2,ipw)-ar *cg_im -ai *cg_re -ar2*cg_im2-ai2*cg_re2&
&                               -ar3*cg_im3-ai3*cg_re3-ar4*cg_im4-ai4*cg_re4
      end do
!$OMP END PARALLEL DO
     end do  ! Loop on iband
     deallocate(atab)

    else if(istwf_k>=2)then

     allocate(atab(nbandm));atab=zero
     do iband=1,nbandm,iblock
      index=npw*(iband-1)+ishft
      index2=npw*iband+ishft
      index3=npw*(iband+1)+ishft
      index4=npw*(iband+2)+ishft

      if(istwf_k==2 .and. mpi_enreg%me_g0==1)then
       if (useoverlap==1) then
        ar =half*scg(1,index +1)*direc(1,1)
        ar2=half*scg(1,index2+1)*direc(1,1)
        ar3=half*scg(1,index3+1)*direc(1,1)
        ar4=half*scg(1,index4+1)*direc(1,1)
       else
        ar =half*cg(1,index +1)*direc(1,1)
        ar2=half*cg(1,index2+1)*direc(1,1)
        ar3=half*cg(1,index3+1)*direc(1,1)
        ar4=half*cg(1,index4+1)*direc(1,1)
       end if
       ipw1=2
      else
       ar=zero ; ar2=zero ; ar3=zero ; ar4=zero ; ipw1=1
      end if

      if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(scg_re,scg_re2,scg_re3,scg_re4,scg_im) &
!$OMP&PRIVATE(scg_im2,scg_im3,scg_im4,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ar,ar2,ar3,ar4) &
!$OMP&SHARED(scg,direc) &
!$OMP&SHARED(index,index2,index3,index4,npw)
       do ipw=ipw1,npw
        direc_re=direc(1,ipw)     ; direc_im=direc(2,ipw)
        scg_re=scg(1,index+ipw)   ; scg_im=scg(2,index+ipw)
        scg_re2=scg(1,index2+ipw) ; scg_im2=scg(2,index2+ipw)
        scg_re3=scg(1,index3+ipw) ; scg_im3=scg(2,index3+ipw)
        scg_re4=scg(1,index4+ipw) ; scg_im4=scg(2,index4+ipw)
        ar=ar+scg_re*direc_re+scg_im*direc_im
        ar2=ar2+scg_re2*direc_re+scg_im2*direc_im
        ar3=ar3+scg_re3*direc_re+scg_im3*direc_im
        ar4=ar4+scg_re4*direc_re+scg_im4*direc_im
       end do
!$OMP END PARALLEL DO
      else
!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_re3,cg_re4,cg_im) &
!$OMP&PRIVATE(cg_im2,cg_im3,cg_im4,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ar,ar2,ar3,ar4) &
!$OMP&SHARED(cg,direc) &
!$OMP&SHARED(index,index2,index3,index4,npw)
       do ipw=ipw1,npw
        direc_re=direc(1,ipw)   ; direc_im=direc(2,ipw)
        cg_re=cg(1,index+ipw)   ; cg_im=cg(2,index+ipw)
        cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
        cg_re3=cg(1,index3+ipw) ; cg_im3=cg(2,index3+ipw)
        cg_re4=cg(1,index4+ipw) ; cg_im4=cg(2,index4+ipw)
        ar=ar+cg_re*direc_re+cg_im*direc_im
        ar2=ar2+cg_re2*direc_re+cg_im2*direc_im
        ar3=ar3+cg_re3*direc_re+cg_im3*direc_im
        ar4=ar4+cg_re4*direc_re+cg_im4*direc_im
       end do
!$OMP END PARALLEL DO
      end if
      atab(iband  )=two*ar
      atab(iband+1)=two*ar2
      atab(iband+2)=two*ar3
      atab(iband+3)=two*ar4
     end do

!XG030513 : MPIWF reduction on the ar,ar2,ar3,ar4 is needed here
     if (mpi_enreg%paral_compil_fft==1) then
      call xcomm_init(mpi_enreg,spaceComm)
      call timab(48,1,tsec)
      call xsum_mpi(atab,spaceComm,ierr)
      call timab(48,2,tsec)
     end if
     scprod(1,1:nbandm)=atab(1:nbandm)
     scprod(2,1:nbandm)=zero

     do iband=1,nbandm,iblock
      index=npw*(iband-1)+icg
      index2=npw*iband+icg
      index3=npw*(iband+1)+icg
      index4=npw*(iband+2)+icg
      ar =scprod(1,iband)
      ar2=scprod(1,iband+1)
      ar3=scprod(1,iband+2)
      ar4=scprod(1,iband+3)
      if ((useoverlap==1).and.(iband  ==iband0)) ar =zero
      if ((useoverlap==1).and.(iband+1==iband0)) ar2=zero
      if ((useoverlap==1).and.(iband+2==iband0)) ar3=zero
      if ((useoverlap==1).and.(iband+3==iband0)) ar4=zero

!$OMP PARALLEL DO PRIVATE(cg_re,cg_re2,cg_re3,cg_re4,cg_im) &
!$OMP&PRIVATE(cg_im2,cg_im3,cg_im4,ipw) &
!$OMP&SHARED(ar,ar2,ar3,ar4,cg,direc) &
!$OMP&SHARED(index,index2,index3,index4,npw)
      do ipw=1,npw
       cg_re=cg(1,index+ipw)   ; cg_im=cg(2,index+ipw)
       cg_re2=cg(1,index2+ipw) ; cg_im2=cg(2,index2+ipw)
       cg_re3=cg(1,index3+ipw) ; cg_im3=cg(2,index3+ipw)
       cg_re4=cg(1,index4+ipw) ; cg_im4=cg(2,index4+ipw)
       direc(1,ipw)=direc(1,ipw)-ar *cg_re -ar2*cg_re2 &
&                               -ar3*cg_re3-ar4*cg_re4
       direc(2,ipw)=direc(2,ipw)-ar *cg_im -ar2*cg_im2 &
&                               -ar3*cg_im3-ar4*cg_im4
      end do
!$OMP END PARALLEL DO
     end do  ! Loop on iband
     deallocate(atab)
    end if  ! Test on istwf_k
   end if  ! Test on nband
  end if  ! Test on ortalg=2, 3 or 4

! Need to treat the bands not yet treated
  if( nbandm /= nband )then
   if(istwf_k==1)then
    do iband=nbandm+1,nband
     ar=zero ; ai=zero
     index=npw*(iband-1)+ishft

     if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(scg_re,scg_im,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ai,ar) &
!$OMP&SHARED(scg,direc,index,npw)
      do ipw=1,npw
       direc_re=direc(1,ipw)   ; direc_im=direc(2,ipw)
       scg_re=scg(1,index+ipw) ; scg_im=scg(2,index+ipw)
       ar=ar+scg_re*direc_re+scg_im*direc_im
       ai=ai-scg_im*direc_re+scg_re*direc_im
      end do
!$OMP END PARALLEL DO
     else
!$OMP PARALLEL DO PRIVATE(cg_re,cg_im,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ai,ar) &
!$OMP&SHARED(cg,direc,index,npw)
      do ipw=1,npw
       direc_re=direc(1,ipw) ; direc_im=direc(2,ipw)
       cg_re=cg(1,index+ipw) ; cg_im=cg(2,index+ipw)
       ar=ar+cg_re*direc_re+cg_im*direc_im
       ai=ai-cg_im*direc_re+cg_re*direc_im
      end do
!$OMP END PARALLEL DO
     end if

!XG030513 : MPIWF reduction on the ar,ai is needed here
     if (mpi_enreg%paral_compil_fft==1) then
      call xcomm_init(mpi_enreg,spaceComm)
      buffer2(1)=ar;buffer2(2)=ai
      call timab(48,1,tsec)
      call xsum_mpi(buffer2,spaceComm,ierr)
      call timab(48,2,tsec)
      ar=buffer2(1);ai=buffer2(2)
     end if

     scprod(1,iband)=ar  ; scprod(2,iband)=ai
     if ((useoverlap==1).and.(iband==iband0)) cycle
     index=npw*(iband-1)+icg

!$OMP PARALLEL DO PRIVATE(cg_re,cg_im,ipw) &
!$OMP&SHARED(ai,ar,cg,direc,index,npw)
     do ipw=1,npw
      cg_re=cg(1,index+ipw) ; cg_im=cg(2,index+ipw)
      direc(1,ipw)=direc(1,ipw)-ar*cg_re+ai*cg_im
      direc(2,ipw)=direc(2,ipw)-ar*cg_im-ai*cg_re
     end do
!$OMP END PARALLEL DO
   end do

   else if(istwf_k>=2)then

    do iband=nbandm+1,nband
     index=npw*(iband-1)+ishft
     if(istwf_k==2 .and. mpi_enreg%me_g0==1)then
      if (useoverlap==1) then
       ar=half*scg(1,1+index)*direc(1,1)
      else
       ar=half*cg(1,1+index)*direc(1,1)
      end if
      ipw1=2
     else
      ar=zero ; ipw1=1
     end if

     if (useoverlap==1) then
!$OMP PARALLEL DO PRIVATE(scg_re,scg_im,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ar) &
!$OMP&SHARED(scg,direc,index,npw)
      do ipw=ipw1,npw
       direc_re=direc(1,ipw)   ; direc_im=direc(2,ipw)
       scg_re=scg(1,index+ipw) ; scg_im=scg(2,index+ipw)
       ar=ar+scg_re*direc_re+scg_im*direc_im
      end do
!$OMP END PARALLEL DO
     else
!$OMP PARALLEL DO PRIVATE(cg_re,cg_im,direc_re,direc_im,ipw) &
!$OMP&REDUCTION(+ :ar) &
!$OMP&SHARED(cg,direc,index,npw)
      do ipw=ipw1,npw
       direc_re=direc(1,ipw) ; direc_im=direc(2,ipw)
       cg_re=cg(1,index+ipw) ; cg_im=cg(2,index+ipw)
       ar=ar+cg_re*direc_re+cg_im*direc_im
      end do
!$OMP END PARALLEL DO
     end if
     ar=two*ar

!XG030513 : MPIWF reduction on the ar is needed here
    if (mpi_enreg%paral_compil_fft==1) then
     call xcomm_init(mpi_enreg,spaceComm)
     call timab(48,1,tsec)
     call xsum_mpi(ar,spaceComm,ierr)
     call timab(48,2,tsec)
    end if
    scprod(1,iband)=ar   ; scprod(2,iband)=zero
    if ((useoverlap==1).and.(iband==iband0)) cycle
    index=npw*(iband-1)+icg

!$OMP PARALLEL DO PRIVATE(cg_re,cg_im,ipw) &
!$OMP&SHARED(ar,cg,direc,index,npw)
     do ipw=1,npw
      cg_re=cg(1,index+ipw) ; cg_im=cg(2,index+ipw)
      direc(1,ipw)=direc(1,ipw)-ar*cg_re
      direc(2,ipw)=direc(2,ipw)-ar*cg_im
     end do
!$OMP END PARALLEL DO
    end do  ! Loop on iband
   end if  ! Test on istwf_k
  end if  ! Test on nband

 end if  ! on ortalg=0 or 1
 mpi_enreg%paral_level=old_paral_level
 call timab(210+tim_projbd,2,tsec)

!DEBUG
!write(6,*)' projbd: debug, enter.'
!ENDDEBUG


end subroutine projbd
!!***
