Bizarre results obtained with a simple program

I have some experience with the gfortran compiler but none whatsoever with ifort, so I must be doing something terribly wrong in what follows. I have recently installed parallel_studio_xe_2016_update1 on my laptop running Debian (Jessie release). I have prepared a very simple program that takes a very naive approach to computing Pi using the Leibniz series. It is probably one of the worst ways of computing Pi, but I just wanted to check vectorization and the stability of the program. The program has one version for single precision and another for double precision.

The single-precision version is the following:

PROGRAM VECT_TEST_SINGLE
  ! Approximates Pi in single precision with the Leibniz series
  !     Pi/4 = SUM_{i>=0} (-1)**i / (2*i+1)
  ! for 11 doubling term counts. For each count it prints: the nominal term
  ! count, the cumulative CPU time spent in all summation loops so far, the
  ! approximation 4*sum, and its difference from ACOS(-1).
  IMPLICIT NONE
  INTEGER, PARAMETER :: SP = KIND(1.0)
  INTEGER, PARAMETER :: DP = KIND(1.0D0)
  !
  REAL(KIND=SP) :: Pi_app     ! partial sum of the series (approximates Pi/4)
  REAL(KIND=SP) :: term_sign  ! +1 for even I, -1 for odd I
  REAL(KIND=DP) :: time_start, time_end, time_tot
  INTEGER :: I, J
  INTEGER :: n_terms          ! upper summation index for the current pass
  !
  time_tot = 0.0_DP
  Pi_app = 0.0_SP
  !
  DO J = 0, 10
     n_terms = (2**J)*100000
     Pi_app = 0.0_SP
     !
     CALL CPU_TIME(time_start)
     !
     ! The index runs from 0 to n_terms inclusive, i.e. n_terms+1 terms.
     DO I = 0, n_terms
        ! Select the sign from the parity of I instead of evaluating
        ! (-1.0)**I: the value is the same exact +/-1.0, but no real
        ! exponentiation is needed per term and iterations stay independent.
        term_sign = MERGE(1.0_SP, -1.0_SP, MOD(I, 2) == 0)
        Pi_app = Pi_app + term_sign/REAL(2*I+1,SP)
     END DO
     !
     CALL CPU_TIME(time_end)
     !
     ! time_tot is cumulative over all passes, not the time of this pass alone.
     time_tot = time_tot + time_end-time_start
     !
     PRINT*, n_terms, time_tot, 4.0_SP*Pi_app, 4.0_SP*Pi_app-ACOS(-1.0_SP)
     !
  ENDDO
  !
END PROGRAM VECT_TEST_SINGLE

The double-precision version differs only in minor details:

PROGRAM VECT_TEST_DOUBLE
  ! Approximates Pi in double precision with the Leibniz series
  !     Pi/4 = SUM_{i>=0} (-1)**i / (2*i+1)
  ! for 11 doubling term counts. For each count it prints: the nominal term
  ! count, the cumulative CPU time spent in all summation loops so far, the
  ! approximation 4*sum, and its difference from ACOS(-1).
  IMPLICIT NONE
  INTEGER, PARAMETER :: DP = KIND(1.0D0)
  !
  REAL(KIND=DP) :: Pi_app     ! partial sum of the series (approximates Pi/4)
  REAL(KIND=DP) :: term_sign  ! +1 for even I, -1 for odd I
  REAL(KIND=DP) :: time_start, time_end, time_tot
  INTEGER :: I, J
  INTEGER :: n_terms          ! upper summation index for the current pass
  !
  time_tot = 0.0_DP
  Pi_app = 0.0_DP
  !
  DO J = 0, 10
     n_terms = (2**J)*100000
     Pi_app = 0.0_DP
     CALL CPU_TIME(time_start)
     !
     ! The index runs from 0 to n_terms inclusive, i.e. n_terms+1 terms.
     DO I = 0, n_terms
        ! Select the sign from the parity of I instead of evaluating
        ! (-1.0)**I: the value is the same exact +/-1.0, but no real
        ! exponentiation is needed per term and iterations stay independent.
        term_sign = MERGE(1.0_DP, -1.0_DP, MOD(I, 2) == 0)
        Pi_app = Pi_app + term_sign/REAL(2*I+1,DP)
     END DO
     !
     CALL CPU_TIME(time_end)
     !
     ! time_tot is cumulative over all passes, not the time of this pass alone.
     time_tot = time_tot + time_end-time_start
     !
     PRINT*, n_terms, time_tot, 4.0_DP*Pi_app, 4.0_DP*Pi_app-ACOS(-1.0_DP)
     !
  ENDDO
  !
END PROGRAM VECT_TEST_DOUBLE

In both cases each output line shows the number of terms of the series that were summed, the cumulative CPU time spent summing so far, the approximate value of Pi, and its difference from ACOS(-1). I compiled the programs with the following Makefile for gfortran:

#
# Builds the single- and double-precision test programs with gfortran.
# Each executable is named <program>_$(FC), so binaries produced by
# different compilers can live side by side in the same directory.
OPT = -O3
FC = gfortran
FLAGS = -march=native -fopt-info-vec-missed -fopt-info-vec-optimized

# all, clean and the short program names are commands, not files.
.PHONY: all clean test_vectorize_Single test_vectorize_Double

all: test_vectorize_Single test_vectorize_Double

# Short names kept as aliases for the real, compiler-suffixed executables.
test_vectorize_Single: test_vectorize_Single_$(FC)
test_vectorize_Double: test_vectorize_Double_$(FC)

# The executable itself is the target, so make can tell when it is out of
# date: it is rebuilt when its source or this Makefile changes.
# Use `make -B` to force a rebuild (e.g. to print the vectorization report again).
test_vectorize_%_$(FC): test_vectorize_%.f90 Makefile
	$(FC) $(OPT) $(FLAGS) -o $@ $<

clean:
	rm -f *.o *.s *.exe *.lst *.ppm test_vectorize_Single_$(FC) test_vectorize_Double_$(FC)

and a different Makefile for ifort:

#
# Builds the single- and double-precision test programs with ifort.
# Each executable is named <program>_$(FC), so binaries produced by
# different compilers can live side by side in the same directory.
#
# NOTE(review): -fast is a bundle of aggressive options; per Intel's
# documentation it appears to include relaxed floating-point settings.
# If numerical results matter, confirm what it expands to for this compiler
# version and consider an explicit floating-point model instead.
OPT = -fast
INFO = -qopt-report=3
# MKL link line; MKLROOT is taken from the environment.
FLAGS = -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm
FC = ifort

# all, clean and the short program names are commands, not files.
.PHONY: all clean test_vectorize_Single test_vectorize_Double

all: test_vectorize_Single test_vectorize_Double

# Short names kept as aliases for the real, compiler-suffixed executables.
test_vectorize_Single: test_vectorize_Single_$(FC)
test_vectorize_Double: test_vectorize_Double_$(FC)

# The executable itself is the target, so make can tell when it is out of
# date: it is rebuilt when its source or Makefile_ifort changes.
# Use `make -B` to force a rebuild (e.g. to regenerate the optimization report).
test_vectorize_%_$(FC): test_vectorize_%.f90 Makefile_ifort
	$(FC) $(OPT) $(INFO) -o $@ $< $(FLAGS)

clean:
	rm -f *.o *.s *.exe *.lst *.ppm test_vectorize_Single_$(FC) test_vectorize_Double_$(FC)

I find the results surprising, especially for single precision:

$ time ./test_vectorize_Single_gfortran
      100000   0.0000000000000000        3.14160585       1.31130219E-05
      200000   0.0000000000000000        3.14160132       8.58306885E-06
      400000   4.0000000000000001E-003   3.14159846       5.72204590E-06
      800000   8.0000000000000002E-003   3.14159727       4.52995300E-06
     1600000   2.0000000000000000E-002   3.14159703       4.29153442E-06
     3200000   3.2000000000000001E-002   3.14159703       4.29153442E-06
     6400000   5.1999999999999991E-002   3.14159703       4.29153442E-06
    12800000   8.3999999999999991E-002   3.14159703       4.29153442E-06
    25600000  0.14799999999999999        3.14159679       4.05311584E-06
    51200000  0.26799999999999990        3.14159679       4.05311584E-06
   102400000  0.50400000000000000        3.14159679       4.05311584E-06

real	0m0.506s
user	0m0.504s
sys	0m0.000s

$ time ./test_vectorize_Single_ifort
      100000  8.000000000000000E-003   3.141608      1.5497208E-05
      200000  2.800000000000000E-002   3.141596      3.5762787E-06
      400000  5.200000000000000E-002   3.141694      1.0156631E-04
      800000  8.799999999999999E-002   3.141549     -4.4107437E-05
     1600000  0.160000000000000        3.142432      8.3971024E-04
     3200000  0.304000000000000        3.140750     -8.4304810E-04
     6400000  0.604000000000000        3.147671      6.0782433E-03
    12800000   1.28000000000000        3.121287     -2.0306110E-02
    25600000   2.56000000000000        3.064596     -7.6996565E-02
    51200000   5.20000000000000        3.031954     -0.1096389
   102400000   10.5840000000000        3.031954     -0.1096389

real	0m10.582s
user	0m10.580s
sys	0m0.004s

The ifort program took much longer to execute, and its results were completely off the mark compared to the gfortran case. In the double-precision case the results are quite similar, though the ifort-compiled executable still took much longer than the gfortran one:

$ time ./test_vectorize_Double_gfortran
      100000   0.0000000000000000        3.1416026534897203        9.9998999272266076E-006
      200000   4.0000000000000001E-003   3.1415976535647618        4.9999749687223982E-006
      400000   8.0000000000000002E-003   3.1415951535834941        2.4999937009440032E-006
      800000   1.6000000000000000E-002   3.1415939035881548        1.2499983617075827E-006
     1600000   2.7999999999999997E-002   3.1415932785894536        6.2499966047013800E-007
     3200000   5.1999999999999991E-002   3.1415929660895618        3.1249976872871343E-007
     6400000   8.0000000000000016E-002   3.1415928098397976        1.5625000449048798E-007
    12800000  0.14000000000000001        3.1415927317148120        7.8125018898589360E-008
    25600000  0.25200000000000000        3.1415926926522508        3.9062457712901733E-008
    51200000  0.47999999999999998        3.1415926731216950        1.9531901873648394E-008
   102400000  0.93599999999999994        3.1415926633549081        9.7651149388866543E-009

real	0m0.938s
user	0m0.936s
sys	0m0.000s
[pts/0][curro.kimoshi: src]$ time ./test_vectorize_Double_ifort
      100000  1.200000000000000E-002   3.14160265348968       9.999899887258579E-006
      200000  2.800000000000000E-002   3.14159765356467       4.999974872355040E-006
      400000  5.599999999999999E-002   3.14159515358340       2.499993601912109E-006
      800000  0.108000000000000        3.14159390358802       1.249998222263571E-006
     1600000  0.192000000000000        3.14159327858927       6.249994810580972E-007
     3200000  0.364000000000000        3.14159296608931       3.124995213710235E-007
     6400000  0.720000000000000        3.14159280983955       1.562497522478168E-007
    12800000   1.44800000000000        3.14159273171429       7.812449842603542E-008
    25600000   2.93600000000000        3.14159269265213       3.906233603245823E-008
    51200000   5.98000000000000        3.14159267312262       1.953282380284804E-008
   102400000   12.2040000000000        3.14159266335631       9.766513375808472E-009

real	0m12.203s
user	0m12.208s
sys	0m0.000s

I am probably doing something wrong or using the wrong options; any explanation of what is going on, suggestion, or remark would be very welcome.