I have built this small test program in an attempt to switch successfully between nested and non-nested parallelism. It is designed for a NUMA system with at least 6 NUMA nodes of 4 cores each, but if you have fewer NUMA nodes, just change the number at the beginning...
So far I have been unable to figure out how to combine nested and non-nested parallelism correctly.
Just a quick explanation of what this program does:
It sets the thread affinity for nested parallelism.
Then it runs a nested parallel region (6 outer threads, one per NUMA node; the node with the master thread runs an inner team of 4 threads, the others run 3). Here the load is distributed as desired across the NUMA nodes.
Then it runs a non-nested parallel region with 6*4 threads scattered across the NUMA nodes. Here the distribution also works as intended.
Then it runs a second nested parallel region, identical to the first one. Now the load distribution is completely broken: all the work ends up on 2 of the NUMA nodes while the rest do nothing.
So does anyone know how to switch between these different types of parallel regions? Is what I'm seeing a bug, or what exactly is happening here?
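For context, the switching pattern I am after boils down to the sketch below. This is illustrative only: it uses the standard omp_set_max_active_levels/proc_bind controls instead of the per-thread SETENVQQ trick my full program uses, and it assumes OMP_PLACES=cores is exported and that cores are numbered contiguously within each NUMA node.

!Minimal sketch of the intended pattern (not the full program):
!nested region, flat region, nested region again, with placement
!handled purely by proc_bind clauses.
program NestSwitchSketch
use omp_lib
implicit none
call omp_set_dynamic(.false.)
call omp_set_max_active_levels(2) !allow one level of nesting
!Nested: 6 outer threads (one per node), inner team of 4 on each
!$OMP PARALLEL NUM_THREADS(6) PROC_BIND(SPREAD)
!$OMP PARALLEL NUM_THREADS(4) PROC_BIND(CLOSE)
print *,'nested1: outer',omp_get_ancestor_thread_num(1),' inner',omp_get_thread_num()
!$OMP END PARALLEL
!$OMP END PARALLEL
!Flat: one thread per core across all nodes
!$OMP PARALLEL NUM_THREADS(24) PROC_BIND(SPREAD)
print *,'flat: thread',omp_get_thread_num()
!$OMP END PARALLEL
!Nested again - the step whose placement goes wrong in the full program
!$OMP PARALLEL NUM_THREADS(6) PROC_BIND(SPREAD)
!$OMP PARALLEL NUM_THREADS(4) PROC_BIND(CLOSE)
print *,'nested2: outer',omp_get_ancestor_thread_num(1),' inner',omp_get_thread_num()
!$OMP END PARALLEL
!$OMP END PARALLEL
end program NestSwitchSketch

The full program follows.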
program NumaAwareDGEMM
use IFPORT
use omp_lib
use mkl_service
use mTEST
implicit none
logical(4) :: Success
integer :: NoNUMANodes, blocksize,nrepeats,Runmode,t0
integer :: N,I,J,NIte, First,Last,k,colidx,error,numofblocks,iii,ii,dim,d,threadID,NumaID
integer :: Iter,Solver,NUMASize,m,ThreadsPrNuma,ID, NCPU
integer, allocatable,dimension(:) :: GlobalThreadID
real*8,allocatable,dimension(:,:) :: A,B,C1,C2,C3,C4,C5,C6,C7,C8
real*8,allocatable,dimension(:,:,:) :: C
real*8,allocatable,dimension(:) :: tmp
logical, allocatable, dimension(:) :: NumaNodeDone,MKlbusy
NoNUMANodes=6 !How many NUMA nodes to distribute calculations over
NCPU=NoNUMANodes*4 !4 cores used per NUMA node
success = SETENVQQ("OMP_DISPLAY_ENV=TRUE")
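!Each {start:count} entry below defines one place of "count" consecutive logical CPUs, i.e. one NUMA node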
success=SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6},{36:6},{42:6}")
!success=SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6}")
!success=SETENVQQ("OMP_PLACES={0:8},{8:8},{16:8},{24:8},{32:8},{40:8}")
blocksize=600 !block dimension handled per NUMA node
dim=blocksize*NoNUMANodes !global matrix dimension
allocate(A(dim,dim))
allocate(B(dim,dim))
allocate(C1(dim,dim))
allocate(C2(dim,dim))
allocate(C3(dim,dim))
allocate(C4(dim,dim))
allocate(C5(dim,dim))
allocate(C6(dim,dim))
allocate(C7(dim,dim))
allocate(C8(dim,dim))
allocate(tmp(NCPU))
A=1d0 !initialize the operands so Products does not run on undefined data
B=1d0
call KMP_SET_STACKSIZE_S(990000000) !~1 GB stack per OpenMP thread (Intel extension)
call omp_set_dynamic(0) !no dynamic adjustment of team sizes
call omp_set_nested(1) !enable nested parallelism for the inner teams in Products
!initialization region
call omp_set_num_threads(NoNUMANodes) !First we spawn all the threads in a threadpool
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID)
!$OMP DO SCHEDULE(STATIC)
do i = 1,NoNUMANodes
ID=omp_get_thread_num()
print *,'Thread binding for socket=',ID
if(i-1.ne.ID) print*,'ERROR on ID',ID,'i=',i
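!Each outer thread exports the place list for the nested team it will spawn later in Products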
SELECT CASE (i)
CASE(1)
! success=SETENVQQ("OMP_PLACES={0:8}")
success=SETENVQQ("OMP_PLACES={0:6}")
CASE(2)
! success=SETENVQQ("OMP_PLACES={8:8}")
success=SETENVQQ("OMP_PLACES={6:6}")
CASE(3)
! success=SETENVQQ("OMP_PLACES={16:8}")
success=SETENVQQ("OMP_PLACES={12:6}")
CASE(4)
success=SETENVQQ("OMP_PLACES={18:6}")
! success=SETENVQQ("OMP_PLACES={24:8}")
CASE(5)
success=SETENVQQ("OMP_PLACES={24:6}")
! success=SETENVQQ("OMP_PLACES={32:8}")
CASE(6)
success=SETENVQQ("OMP_PLACES={30:6}")
! success=SETENVQQ("OMP_PLACES={40:8}")
CASE(7)
success=SETENVQQ("OMP_PLACES={36:6}")
! success=SETENVQQ("OMP_PLACES={48:8}")
CASE(8)
success=SETENVQQ("OMP_PLACES={42:6}")
! success=SETENVQQ("OMP_PLACES={56:8}")
END SELECT
end do
!$OMP END DO
!$OMP END PARALLEL
print*,'Initialization over'
!
call omp_set_num_threads(NoNUMANodes) !Now outer parallelization over numa nodes
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID)
!$OMP DO SCHEDULE(STATIC)
do i = 1,NoNumanodes
ID=omp_get_thread_num()
SELECT CASE (i)
CASE(1)
call Products(dim,A,B,C1)
CASE(2)
call Products(dim,A,B,C2)
CASE(3)
call Products(dim,A,B,C3)
CASE(4)
call Products(dim,A,B,C4)
CASE(5)
call Products(dim,A,B,C5)
CASE(6)
call Products(dim,A,B,C6)
CASE(7)
call Products(dim,A,B,C7)
CASE(8)
call Products(dim,A,B,C8)
END SELECT
end do
!$OMP END DO
!$OMP END PARALLEL
print*,'First Nested done '
print*,'Starting single parallel region'
call omp_set_num_threads(NCPU)
print*,'Proc_bind',omp_get_proc_bind()
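!Flat (non-nested) region: one thread per core, spread across all places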
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j) proc_bind(Spread)
!$OMP DO SCHEDULE(STATIC)
do i=1,NCPU
tmp(i)=0d0
do j=1,1000000000 !dummy busy-work so every core stays loaded
tmp(i)=tmp(i)+exp(i*1d0)*exp(-(i*1d0))+real(j,8)**2 !real accumulator; the old integer k=k+j**2 overflowed
end do
end do
!$OMP END DO
!$OMP END PARALLEL
print*,'Single parallel region done'
call omp_set_num_threads(NoNUMANodes) !Now outer parallelization over numa nodes
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i) proc_bind(Spread)
!$OMP DO SCHEDULE(STATIC)
do i = 1,NoNumanodes
SELECT CASE (i)
CASE(1)
call Products(dim,A,B,C1)
CASE(2)
call Products(dim,A,B,C2)
CASE(3)
call Products(dim,A,B,C3)
CASE(4)
call Products(dim,A,B,C4)
CASE(5)
call Products(dim,A,B,C5)
CASE(6)
call Products(dim,A,B,C6)
CASE(7)
call Products(dim,A,B,C7)
CASE(8)
call Products(dim,A,B,C8)
END SELECT
end do
!$OMP END DO
!$OMP END PARALLEL
end program NumaAwareDGEMM
!Note: this module must be compiled before the program above - put it first in the file or in a separate file
module mTEST
use omp_lib
contains
subroutine Products(n,A,B,C)
implicit none
real*8,dimension(:,:) :: A,B,C
integer :: n
integer :: i,j,k,ID
ID=omp_get_thread_num()
if (ID.eq.0) then
call omp_set_num_threads(4) !inner team size on the master thread's node
else
call omp_set_num_threads(3) !inner team size on the other nodes
end if
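!Inner team: PROC_BIND(MASTER) places every inner thread on the same place as the spawning outer thread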
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k) PROC_BIND(MASTER)
!$OMP DO SCHEDULE(STATIC)
do i=1,n
do j=1,n
C(i,j)=0d0
do k=1,n
C(i,j)=C(i,j)+A(i,k)*B(k,j) !naive matrix product; j and k must be private or the threads race
end do
end do
end do
!$OMP END DO
!$OMP END PARALLEL
end subroutine Products
end module mTEST
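For reference, I compile with Intel Fortran, along the lines of ifort -qopenmp -mkl NumaAwareDGEMM.f90 (exact flags from memory; they should not affect the binding behaviour). To check where threads actually land after each region, a small helper like the hypothetical one below can be dropped in; it assumes an OpenMP 4.5 runtime for omp_get_place_num.

!Hypothetical helper (not part of the program above): spawns a team with the
!current settings and prints one line per thread so placement can be verified
subroutine report_binding(label)
use omp_lib
implicit none
character(len=*) :: label
!$OMP PARALLEL
print *,label,': thread',omp_get_thread_num(),' on place',omp_get_place_num()
!$OMP END PARALLEL
end subroutine report_binding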