Skip to content

Commit a85b886

Browse files
Serguei PatchkovskiiSerguei Patchkovskii
Serguei Patchkovskii
authored and
Serguei Patchkovskii
committed
Reduce stack usage for better compatibility with macOS
General cleanup of the code to reduce the creation of array temporaries by gfortran 12.2. These changes should lead to lower stack usage, which is particularly important on macOS and other systems with low stack limits. The effect on performance on other systems should be neutral, or even positive. Additionally, all unused wrapper routines were removed from lapack.f90, the glue interface to LAPACK. The remaining routines (diagonalization of general complex matrices) were rewritten to use the recommended call sequence for allocating temporary storage. There are still several crashes on macOS in the test set, but the present state is a marked improvement.
1 parent 1433809 commit a85b886

19 files changed

+284
-1246
lines changed

Makefile

+2-10
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ ACT2 = -e 's/^!\*nm/ /' # Disable MPI statements
1919
#
2020
# System-specific overrides
2121
#
22-
include vanilla.mak
22+
# include vanilla.mak
2323
# include configs/babel-gfortran_opt.mak
2424
# include configs/babel-ifort18_opt.mak
2525
# include configs/zen-gfortran-7_opt.mak
2626
# include configs/zen-gfortran-11_opt.mak
27-
# include configs/zen-gfortran-12_opt.mak
27+
include configs/zen-gfortran-12_opt.mak
2828
# include configs/zen-oneapi_opt.mak
2929
# include configs/zen-oneapi_opt_mpi.mak
3030
# include configs/zen-aocc-1.1_opt.mak # VERY SLOW CODE. DO NOT USE.
@@ -59,12 +59,6 @@ LIBS = $(LAPACK) $(LAPACK) $(LIBEXTRA)
5959
$(ACT) $(ACT2) $< >preprocess/$<
6060
$(F90) -c preprocess/$<
6161

62-
dgefa.o: dgefa.f
63-
$(F90) -c dgefa.f
64-
65-
dgedi.o: dgedi.f
66-
$(F90) -c dgedi.f
67-
6862
#hacks.o: hacks.f90 accuracy.o
6963
# $(F90) -O0 -c hacks.f90
7064

@@ -86,8 +80,6 @@ LIBSPHERICAL += composition_analysis.o
8680
LIBSPHERICAL += constants.o
8781
LIBSPHERICAL += coulomb_functions.o
8882
LIBSPHERICAL += cubic_spline.o
89-
LIBSPHERICAL += dgefa.o
90-
LIBSPHERICAL += dgedi.o
9183
LIBSPHERICAL += hacks.o
9284
LIBSPHERICAL += lapack.o
9385
LIBSPHERICAL += math.o

checkpoint_tools.f90

+4-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ module checkpoint_tools
6161
integer(ik), save :: ckpt_max_checkpoints = 3_ik ! Maximum number of checkpoints to keep.
6262
integer(ik), save :: ckpt_interval = 10000_ik ! Number of timesteps between checkpoints.
6363
!
64-
character(len=clen), save :: rcsid_checkpoint_tools = "$Id: checkpoint_tools.f90,v 1.16 2021/04/26 15:44:44 ps Exp ps $"
64+
character(len=clen), save :: rcsid_checkpoint_tools = "$Id: checkpoint_tools.f90,v 1.17 2022/10/08 17:24:26 ps Exp ps $"
6565
!
6666
type ckpt_data
6767
private ! This data is not for external consumption
@@ -243,6 +243,9 @@ subroutine ckpt_load_checkpoint(ck,wfn_l,wfn_r,tsurf)
243243
action = 'verifying header'
244244
if (tag/='SCID CKPT' .or. version/=ckpt_version .or. any(options.neqv.options2)) then
245245
write (out,"('Checkpoint header mismatch')")
246+
! gfortran introduces an array temporary for the transfer() below. There
247+
! does not seem to be anything we could do about it: adding an explicit
248+
! temporary still creates a temporary array ...
246249
write (out,"('Expected: ',a,' V.',i0,' options = ',2l1,'(',2z8,')')") &
247250
'SCID CKPT', ckpt_version, options, transfer(options,1_ik,2)
248251
write (out,"('Received: ',a,' V.',i0,' options = ',2l1,'(',2z8,')')") &

composition_analysis.f90

+3-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ module composition_analysis
3737
public ca_maxram
3838
public rcsid_composition_analysis
3939
!
40-
character(len=clen) :: rcsid_composition_analysis = "$Id: composition_analysis.f90,v 1.17 2021/04/26 15:44:44 ps Exp ps $"
40+
character(len=clen) :: rcsid_composition_analysis = "$Id: composition_analysis.f90,v 1.18 2022/10/08 17:24:26 ps Exp ps $"
4141
!
4242
real(rk), save :: ca_maxram = 0._rk ! Maximum amount of memory which can be used during the analysis step
4343
! This limit does NOT include the memory needed to compute atomic
@@ -167,6 +167,8 @@ subroutine ca_analyze(verbose,threshold,wfn_l,wfn_r,tsurf)
167167
call wt_atomic_solutions(verbose,lval,block_eval,block_evec)
168168
en(:,lval) = block_eval(:)
169169
scan_spin: do ispin=1,sd_nspin
170+
! gfortran is having trouble generating good code for this array assignment
171+
! there does not seem to be anything we could do about it though ...
170172
amp(:,1,ispin,mmin:mmax,lval) = matmul(transpose(block_evec(:,:,1)),wfn_r%wfn(:,ispin,lval,mmin:mmax))
171173
amp(:,2,ispin,mmin:mmax,lval) = matmul(transpose(block_evec(:,:,2)),wfn_l%wfn(:,ispin,lval,mmin:mmax))
172174
end do scan_spin

configs/macos_m1-gfortran_opt.mak

+10-8
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1-
BUILD_ID :="Optimized gfortran (-O1), built on $(shell hostname) at $(shell date)"
2-
ACT = sed -e 's/^!\*nq/ /' # Enable quad-math statements
3-
# WARNING: gfortran 11.2.0 appears to have many bugs in OpenMP code generation, especially at -O3
4-
# WARNING: Using -O1 reduces (but does not completely eliminate) these bugs.
5-
# WARNING: Please make sure that the ENTIRE test suite runs correctly before using in production.
1+
BUILD_ID :="Optimized gfortran, built on $(shell hostname) at $(shell date)"
2+
ACT = sed -e 's/^!\*nq/ /' # Disable quad-math statements
3+
# -Ofast is known to produce code which crashes on M1, at least with gfortran 12.2
64
F90 = gfortran -I. \
7-
-O1 -march=native -mtune=native -fopenmp \
8-
-floop-block \
5+
-flto -O1 -fprotect-parens -march=native -mtune=native -fopenmp \
96
-ffast-math -fcx-fortran-rules \
7+
-floop-block \
108
-fno-realloc-lhs -fbacktrace -g \
119
-cpp -D__BUILD_ID__='$(BUILD_ID)' -ffree-line-length-none
1210
F90L = $(F90)
13-
LAPACK = -L/opt/homebrew/opt/openblas/lib -lopenblas
11+
#
12+
# Both openblas and Accelerate are known to work, with openblas being somewhat faster
13+
#
14+
#LAPACK = -L/opt/homebrew/opt/openblas/lib -lopenblas
15+
LAPACK = -framework Accelerate
1416
LIBEXTRA =

coulomb_functions.f90

+2-2
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ module coulomb_functions
7272
public coulombFG, coulombF, coulombBound
7373
public rcsid_coulomb_functions
7474
!
75-
character(len=clen), save :: rcsid_coulomb_functions = "$Id: coulomb_functions.f90,v 1.13 2021/04/26 15:44:44 ps Exp ps $"
75+
character(len=clen), save :: rcsid_coulomb_functions = "$Id: coulomb_functions.f90,v 1.14 2022/10/08 17:24:26 ps Exp ps $"
7676
!
7777
! integer, parameter :: out = 6
7878
! integer, parameter :: ik = selected_int_kind(15)
@@ -142,7 +142,7 @@ subroutine coulombFG(lambdamin,nlambda,x,eta,fg,skip_g)
142142
!
143143
if (x<=0) stop 'coulomb_functions%coulombFG - bad x'
144144
if (nlambda<=0) stop 'coulomb_functions%coulombFG - bad nlambda'
145-
if (any(ubound(fg)/=(/nlambda,4_ik/))) stop 'coulomb_functions%coulombFG - bad array sizes'
145+
if (ubound(fg,1)/=nlambda .or. ubound(fg,2)/=4_ik) stop 'coulomb_functions%coulombFG - bad array sizes'
146146
!
147147
! Start by filling coefficient tables: we'll need these repeatedly.
148148
!

dgedi.f

-129
This file was deleted.

dgefa.f

-111
This file was deleted.

0 commit comments

Comments
 (0)