I have built this small test program in an attempt to switch cleanly between nested and non-nested parallelism. It is designed for a NUMA system with at least 6 NUMA nodes of 4 cores each; if you have fewer NUMA nodes, just change NoNUMANodes at the beginning.
So far I have been unable to figure out how to correctly combine nested and non-nested parallelism.
Just a quick explanation of what this program does:
First it sets the thread affinity (via OMP_PLACES) for nested parallelism.
Then it runs a nested parallel region (one outer thread per NUMA node, with 4 inner threads on the master thread's node and 3 on each of the others). Here the load is distributed as desired across the NUMA nodes.
Then it runs a non-nested parallel region with 6*4 threads spread across the NUMA nodes. Here the distribution also works as intended.
Then it runs a second nested parallel region identical to the first one. Now the load distribution is completely wrong: all the work ends up on 2 of the NUMA nodes while the rest do nothing. (A condensed sketch of this pattern follows below.)
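In condensed form, the pattern I am trying to get working looks like this (a minimal sketch only, not the full program below; the uniform 4-thread inner teams and the proc_bind clauses on every region are a simplification of what my actual code does):

program pattern_sketch
  use omp_lib
  implicit none

  call omp_set_dynamic(0)
  call omp_set_nested(1)

  ! Phase 1: nested region, one outer thread per NUMA node,
  ! inner team pinned near its parent thread
  call omp_set_num_threads(6)
  !$OMP PARALLEL proc_bind(spread)
  call omp_set_num_threads(4)
  !$OMP PARALLEL proc_bind(master)
  print *,'phase 1, level',omp_get_level(),'thread',omp_get_thread_num()
  !$OMP END PARALLEL
  !$OMP END PARALLEL

  ! Phase 2: flat region, 24 threads spread over all nodes
  call omp_set_num_threads(24)
  !$OMP PARALLEL proc_bind(spread)
  print *,'phase 2, thread',omp_get_thread_num()
  !$OMP END PARALLEL

  ! Phase 3: the same nested region again -- this is where the
  ! placement collapses onto 2 nodes for me
  call omp_set_num_threads(6)
  !$OMP PARALLEL proc_bind(spread)
  call omp_set_num_threads(4)
  !$OMP PARALLEL proc_bind(master)
  print *,'phase 3, level',omp_get_level(),'thread',omp_get_thread_num()
  !$OMP END PARALLEL
  !$OMP END PARALLEL
end program pattern_sketch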
So does anyone know how to switch between these different types of parallel regions? Is what I'm seeing a bug, or what exactly is happening here?
program NumaAwareDGEMM
  use IFPORT
  use omp_lib
  use mkl_service
  use mTEST
  implicit none
  logical(4) :: Success
  integer :: NoNUMANodes, blocksize, nrepeats, Runmode, t0
  integer :: N, I, J, NIte, First, Last, k, colidx, error, numofblocks, iii, ii, dim, d, threadID, NumaID
  integer :: Iter, Solver, NUMASize, m, ThreadsPrNuma, ID, NCPU
  integer, allocatable, dimension(:)     :: GlobalThreadID
  real*8,  allocatable, dimension(:,:)   :: A, B, C1, C2, C3, C4, C5, C6, C7, C8
  real*8,  allocatable, dimension(:,:,:) :: C
  real*8,  allocatable, dimension(:)     :: tmp
  logical, allocatable, dimension(:)     :: NumaNodeDone, MKlbusy

  NoNUMANodes = 6      ! How many NUMA nodes to distribute calculations over
  NCPU        = 6*4
  success = SETENVQQ("OMP_DISPLAY_ENV=TRUE")
  success = SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6},{36:6},{42:6}")
  !success = SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6}")
  !success = SETENVQQ("OMP_PLACES={0:8},{8:8},{16:8},{24:8},{32:8},{40:8}")

  blocksize = 600
  dim = blocksize*NoNUMANodes
  allocate(A(dim,dim))
  allocate(B(dim,dim))
  allocate(C1(dim,dim))
  allocate(C2(dim,dim))
  allocate(C3(dim,dim))
  allocate(C4(dim,dim))
  allocate(C5(dim,dim))
  allocate(C6(dim,dim))
  allocate(C7(dim,dim))
  allocate(C8(dim,dim))
  allocate(tmp(NCPU))

  call KMP_SET_STACKSIZE_S(990000000)
  call omp_set_dynamic(0)
  call omp_set_nested(1)

  ! Initialization region: spawn the whole thread pool once, and set the
  ! per-socket OMP_PLACES string from within each outer thread
  call omp_set_num_threads(NoNUMANodes)
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    ID = omp_get_thread_num()
    print *,'Thread binding for socket=',ID
    if (i-1 .ne. ID) print *,'ERROR on ID',ID,'i=',i
    SELECT CASE (i)
      CASE(1)
        success = SETENVQQ("OMP_PLACES={0:6}")
        !success = SETENVQQ("OMP_PLACES={0:8}")
      CASE(2)
        success = SETENVQQ("OMP_PLACES={6:6}")
        !success = SETENVQQ("OMP_PLACES={8:8}")
      CASE(3)
        success = SETENVQQ("OMP_PLACES={12:6}")
        !success = SETENVQQ("OMP_PLACES={16:8}")
      CASE(4)
        success = SETENVQQ("OMP_PLACES={18:6}")
        !success = SETENVQQ("OMP_PLACES={24:8}")
      CASE(5)
        success = SETENVQQ("OMP_PLACES={24:6}")
        !success = SETENVQQ("OMP_PLACES={32:8}")
      CASE(6)
        success = SETENVQQ("OMP_PLACES={30:6}")
        !success = SETENVQQ("OMP_PLACES={40:8}")
      CASE(7)
        success = SETENVQQ("OMP_PLACES={36:6}")
        !success = SETENVQQ("OMP_PLACES={48:8}")
      CASE(8)
        success = SETENVQQ("OMP_PLACES={42:6}")
        !success = SETENVQQ("OMP_PLACES={56:8}")
    END SELECT
  end do
  !$OMP END DO
  !$OMP END PARALLEL
  print *,'Initialization over'
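  ! (Illustrative addition, not part of my original program: with an
  ! OpenMP 4.5 runtime the placement could be verified at this point, e.g.
  !   !$OMP PARALLEL PRIVATE(ID)
  !     ID = omp_get_thread_num()
  !     print *,'thread',ID,'bound to place',omp_get_place_num()
  !   !$OMP END PARALLEL
  ! where omp_get_place_num() returns the place number the calling thread
  ! is currently bound to.)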
  call omp_set_num_threads(NoNUMANodes)
  ! First nested region: outer parallelization over NUMA nodes,
  ! inner parallelization inside Products
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    ID = omp_get_thread_num()
    SELECT CASE (i)
      CASE(1)
        call Products(dim,A,B,C1)
      CASE(2)
        call Products(dim,A,B,C2)
      CASE(3)
        call Products(dim,A,B,C3)
      CASE(4)
        call Products(dim,A,B,C4)
      CASE(5)
        call Products(dim,A,B,C5)
      CASE(6)
        call Products(dim,A,B,C6)
      CASE(7)
        call Products(dim,A,B,C7)
      CASE(8)
        call Products(dim,A,B,C8)
    END SELECT
  end do
  !$OMP END DO
  !$OMP END PARALLEL
  print *,'First Nested done'

  print *,'Starting single parallel region'
  call omp_set_num_threads(NCPU)
  print *,'Proc_bind',omp_get_proc_bind()
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k) proc_bind(spread)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NCPU
    k = 0
    do j = 1, 1000000000     ! busy-work loop to keep every thread occupied
      k = k + exp((i*1d0))*exp(-(i*1d0)) + (j**2)
    end do
    tmp(i) = k
  end do
  !$OMP END DO
  !$OMP END PARALLEL
  print *,'Single parallel region done'

  call omp_set_num_threads(NoNUMANodes)
  ! Second nested region: identical to the first one
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i) proc_bind(spread)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    SELECT CASE (i)
      CASE(1)
        call Products(dim,A,B,C1)
      CASE(2)
        call Products(dim,A,B,C2)
      CASE(3)
        call Products(dim,A,B,C3)
      CASE(4)
        call Products(dim,A,B,C4)
      CASE(5)
        call Products(dim,A,B,C5)
      CASE(6)
        call Products(dim,A,B,C6)
      CASE(7)
        call Products(dim,A,B,C7)
      CASE(8)
        call Products(dim,A,B,C8)
    END SELECT
  end do
  !$OMP END DO
  !$OMP END PARALLEL
end program NumaAwareDGEMM

! Note: for single-file compilation this module must come before the
! program unit that USEs it (or be compiled separately first).
module mTEST
  use omp_lib
contains
  subroutine Products(n,A,B,C)
    implicit none
    real*8, dimension(:,:) :: A, B, C
    integer :: n
    integer :: i, j, k, ID

    ID = omp_get_thread_num()
    if (ID .eq. 0) then
      call omp_set_num_threads(4)   ! inner parallelization: 4 threads on the master's node
    else
      call omp_set_num_threads(3)   ! inner parallelization: 3 threads on the other nodes
    end if
    ! j and k must be private as well to avoid a data race
    !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k) PROC_BIND(MASTER)
    !$OMP DO SCHEDULE(STATIC)
    do i = 1, n
      do j = 1, n
        do k = 1, n
          C(i,j) = A(i,j)*B(j,k)
        end do
      end do
    end do
    !$OMP END DO
    !$OMP END PARALLEL
  end subroutine Products
end module mTEST
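For completeness: the only alternative I know of is to configure the nesting entirely through environment variables before the runtime initializes, instead of rewriting OMP_PLACES from inside the threads. Below is a minimal sketch of that variant. The place strings and thread counts are assumptions for a 6-node, 4-core layout, and whether SETENVQQ calls made before the first parallel region are honored depends on when the OpenMP runtime reads its environment, so exporting these variables in the shell before launch is the safer route.

program envvar_variant
  use IFPORT
  use omp_lib
  implicit none
  logical(4) :: ok

  ! One place per NUMA node (hypothetical 6 nodes x 4 cores); the per-level
  ! thread counts and binding policies use the OpenMP 4.0 list syntax
  ok = SETENVQQ("OMP_PLACES={0:4},{4:4},{8:4},{12:4},{16:4},{20:4}")
  ok = SETENVQQ("OMP_NUM_THREADS=6,4")
  ok = SETENVQQ("OMP_PROC_BIND=spread,close")
  call omp_set_nested(1)

  !$OMP PARALLEL              ! outer level: 6 threads, one per place
  !$OMP PARALLEL              ! inner level: 4 threads close to their parent
  print *,'level',omp_get_level(),'thread',omp_get_thread_num()
  !$OMP END PARALLEL
  !$OMP END PARALLEL
end program envvar_variant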