I have built this small test program in an attempt to switch cleanly between nested and non-nested parallelism. It is designed for a NUMA system with at least 6 NUMA nodes of 4 cores each; if you have fewer NUMA nodes, just change NoNUMANodes at the beginning.
So far I have been unable to figure out how to correctly combine nested and non-nested parallelism.
Just a quick explanation of what this program does:
First it sets the thread affinity (via OMP_PLACES) for nested parallelism.
Then it runs a nested parallel region (one outer thread per NUMA node, with 4 inner threads on the master thread's node and 3 on each of the others). Here the load is distributed as desired across the NUMA nodes.
Then it runs a non-nested parallel region with 6*4 threads spread across the NUMA nodes. Here the distribution also works as intended.
Then it runs a second nested parallel region identical to the first one. Now the load distribution is completely wrong: all the work ends up on 2 of the NUMA nodes while the rest do nothing. (A condensed sketch of this pattern follows below.)
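In condensed form, the pattern I am trying to get working looks like this (a minimal sketch only, not the full program below; the uniform 4-thread inner teams and the proc_bind clauses on every region are a simplification of what my actual code does):

program pattern_sketch
  use omp_lib
  implicit none

  call omp_set_dynamic(0)
  call omp_set_nested(1)

  ! Phase 1: nested region, one outer thread per NUMA node,
  ! inner team pinned near its parent thread
  call omp_set_num_threads(6)
  !$OMP PARALLEL proc_bind(spread)
  call omp_set_num_threads(4)
  !$OMP PARALLEL proc_bind(master)
  print *,'phase 1, level',omp_get_level(),'thread',omp_get_thread_num()
  !$OMP END PARALLEL
  !$OMP END PARALLEL

  ! Phase 2: flat region, 24 threads spread over all nodes
  call omp_set_num_threads(24)
  !$OMP PARALLEL proc_bind(spread)
  print *,'phase 2, thread',omp_get_thread_num()
  !$OMP END PARALLEL

  ! Phase 3: the same nested region again -- this is where the
  ! placement collapses onto 2 nodes for me
  call omp_set_num_threads(6)
  !$OMP PARALLEL proc_bind(spread)
  call omp_set_num_threads(4)
  !$OMP PARALLEL proc_bind(master)
  print *,'phase 3, level',omp_get_level(),'thread',omp_get_thread_num()
  !$OMP END PARALLEL
  !$OMP END PARALLEL
end program pattern_sketch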
So does anyone know how to switch between these different types of parallel regions? Is what I'm seeing a bug, or what exactly is happening here?
program NumaAwareDGEMM
  use IFPORT
  use omp_lib
  use mkl_service
  use mTEST
  implicit none
  logical(4) :: Success
  integer :: NoNUMANodes, blocksize, nrepeats, Runmode, t0
  integer :: N, I, J, NIte, First, Last, k, colidx, error, numofblocks, iii, ii, dim, d, threadID, NumaID
  integer :: Iter, Solver, NUMASize, m, ThreadsPrNuma, ID, NCPU
  integer, allocatable, dimension(:)     :: GlobalThreadID
  real*8,  allocatable, dimension(:,:)   :: A, B, C1, C2, C3, C4, C5, C6, C7, C8
  real*8,  allocatable, dimension(:,:,:) :: C
  real*8,  allocatable, dimension(:)     :: tmp
  logical, allocatable, dimension(:)     :: NumaNodeDone, MKlbusy

  NoNUMANodes = 6      ! How many NUMA nodes to distribute calculations over
  NCPU        = 6*4
  success = SETENVQQ("OMP_DISPLAY_ENV=TRUE")
  success = SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6},{36:6},{42:6}")
  !success = SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6}")
  !success = SETENVQQ("OMP_PLACES={0:8},{8:8},{16:8},{24:8},{32:8},{40:8}")

  blocksize = 600
  dim = blocksize*NoNUMANodes
  allocate(A(dim,dim))
  allocate(B(dim,dim))
  allocate(C1(dim,dim))
  allocate(C2(dim,dim))
  allocate(C3(dim,dim))
  allocate(C4(dim,dim))
  allocate(C5(dim,dim))
  allocate(C6(dim,dim))
  allocate(C7(dim,dim))
  allocate(C8(dim,dim))
  allocate(tmp(NCPU))

  call KMP_SET_STACKSIZE_S(990000000)
  call omp_set_dynamic(0)
  call omp_set_nested(1)

  ! Initialization region: spawn the whole thread pool once, and set the
  ! per-socket OMP_PLACES string from within each outer thread
  call omp_set_num_threads(NoNUMANodes)
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    ID = omp_get_thread_num()
    print *,'Thread binding for socket=',ID
    if (i-1 .ne. ID) print *,'ERROR on ID',ID,'i=',i
    SELECT CASE (i)
      CASE(1)
        success = SETENVQQ("OMP_PLACES={0:6}")
        !success = SETENVQQ("OMP_PLACES={0:8}")
      CASE(2)
        success = SETENVQQ("OMP_PLACES={6:6}")
        !success = SETENVQQ("OMP_PLACES={8:8}")
      CASE(3)
        success = SETENVQQ("OMP_PLACES={12:6}")
        !success = SETENVQQ("OMP_PLACES={16:8}")
      CASE(4)
        success = SETENVQQ("OMP_PLACES={18:6}")
        !success = SETENVQQ("OMP_PLACES={24:8}")
      CASE(5)
        success = SETENVQQ("OMP_PLACES={24:6}")
        !success = SETENVQQ("OMP_PLACES={32:8}")
      CASE(6)
        success = SETENVQQ("OMP_PLACES={30:6}")
        !success = SETENVQQ("OMP_PLACES={40:8}")
      CASE(7)
        success = SETENVQQ("OMP_PLACES={36:6}")
        !success = SETENVQQ("OMP_PLACES={48:8}")
      CASE(8)
        success = SETENVQQ("OMP_PLACES={42:6}")
        !success = SETENVQQ("OMP_PLACES={56:8}")
    END SELECT
  end do
  !$OMP END DO
  !$OMP END PARALLEL
  print *,'Initialization over'
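  ! (Illustrative addition, not part of my original program: with an
  ! OpenMP 4.5 runtime the placement could be verified at this point, e.g.
  !   !$OMP PARALLEL PRIVATE(ID)
  !     ID = omp_get_thread_num()
  !     print *,'thread',ID,'bound to place',omp_get_place_num()
  !   !$OMP END PARALLEL
  ! where omp_get_place_num() returns the place number the calling thread
  ! is currently bound to.)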
  call omp_set_num_threads(NoNUMANodes)
  ! First nested region: outer parallelization over NUMA nodes,
  ! inner parallelization inside Products
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    ID = omp_get_thread_num()
    SELECT CASE (i)
      CASE(1)
        call Products(dim,A,B,C1)
      CASE(2)
        call Products(dim,A,B,C2)
      CASE(3)
        call Products(dim,A,B,C3)
      CASE(4)
        call Products(dim,A,B,C4)
      CASE(5)
        call Products(dim,A,B,C5)
      CASE(6)
        call Products(dim,A,B,C6)
      CASE(7)
        call Products(dim,A,B,C7)
      CASE(8)
        call Products(dim,A,B,C8)
    END SELECT
  end do
  !$OMP END DO
  !$OMP END PARALLEL
  print *,'First Nested done'

  print *,'Starting single parallel region'
  call omp_set_num_threads(NCPU)
  print *,'Proc_bind',omp_get_proc_bind()
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k) proc_bind(spread)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NCPU
    k = 0
    do j = 1, 1000000000     ! busy-work loop to keep every thread occupied
      k = k + exp((i*1d0))*exp(-(i*1d0)) + (j**2)
    end do
    tmp(i) = k
  end do
  !$OMP END DO
  !$OMP END PARALLEL
  print *,'Single parallel region done'

  call omp_set_num_threads(NoNUMANodes)
  ! Second nested region: identical to the first one
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i) proc_bind(spread)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    SELECT CASE (i)
      CASE(1)
        call Products(dim,A,B,C1)
      CASE(2)
        call Products(dim,A,B,C2)
      CASE(3)
        call Products(dim,A,B,C3)
      CASE(4)
        call Products(dim,A,B,C4)
      CASE(5)
        call Products(dim,A,B,C5)
      CASE(6)
        call Products(dim,A,B,C6)
      CASE(7)
        call Products(dim,A,B,C7)
      CASE(8)
        call Products(dim,A,B,C8)
    END SELECT
  end do
  !$OMP END DO
  !$OMP END PARALLEL
end program NumaAwareDGEMM

! Note: for single-file compilation this module must come before the
! program unit that USEs it (or be compiled separately first).
module mTEST
  use omp_lib
contains
  subroutine Products(n,A,B,C)
    implicit none
    real*8, dimension(:,:) :: A, B, C
    integer :: n
    integer :: i, j, k, ID

    ID = omp_get_thread_num()
    if (ID .eq. 0) then
      call omp_set_num_threads(4)   ! inner parallelization: 4 threads on the master's node
    else
      call omp_set_num_threads(3)   ! inner parallelization: 3 threads on the other nodes
    end if
    ! j and k must be private as well to avoid a data race
    !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,k) PROC_BIND(MASTER)
    !$OMP DO SCHEDULE(STATIC)
    do i = 1, n
      do j = 1, n
        do k = 1, n
          C(i,j) = A(i,j)*B(j,k)
        end do
      end do
    end do
    !$OMP END DO
    !$OMP END PARALLEL
  end subroutine Products
end module mTEST
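For completeness: the only alternative I know of is to configure the nesting entirely through environment variables before the runtime initializes, instead of rewriting OMP_PLACES from inside the threads. Below is a minimal sketch of that variant. The place strings and thread counts are assumptions for a 6-node, 4-core layout, and whether SETENVQQ calls made before the first parallel region are honored depends on when the OpenMP runtime reads its environment, so exporting these variables in the shell before launch is the safer route.

program envvar_variant
  use IFPORT
  use omp_lib
  implicit none
  logical(4) :: ok

  ! One place per NUMA node (hypothetical 6 nodes x 4 cores); the per-level
  ! thread counts and binding policies use the OpenMP 4.0 list syntax
  ok = SETENVQQ("OMP_PLACES={0:4},{4:4},{8:4},{12:4},{16:4},{20:4}")
  ok = SETENVQQ("OMP_NUM_THREADS=6,4")
  ok = SETENVQQ("OMP_PROC_BIND=spread,close")
  call omp_set_nested(1)

  !$OMP PARALLEL              ! outer level: 6 threads, one per place
  !$OMP PARALLEL              ! inner level: 4 threads close to their parent
  print *,'level',omp_get_level(),'thread',omp_get_thread_num()
  !$OMP END PARALLEL
  !$OMP END PARALLEL
end program envvar_variant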