Skip to content

Compiling with -O{1,2,3} breaks custom rule that works with -O0 #1994

@cgeoga

Description

@cgeoga

My usual preface: apologies if I have missed relevant docs or existing/past issues on this.

I have some C code that uses a custom forward-mode rule. When compiled with -O0, that rule gets triggered. But if I compile with any higher optimization level, the custom rule does not get triggered (which I check using a simple print statement inside the rule). Here is an MWE:

#include<stdlib.h>
#include<stdio.h>
#include<math.h>

/* Integer globals passed as marker arguments in the __enzyme_fwddiff call in
   test(): enzyme_dup is placed before a (value, tangent) pair and
   enzyme_const before a single plain argument. enzyme_dupnoneed is declared
   but not used anywhere in this file.
   NOTE(review): presumably these are recognized by name by the Enzyme
   plugin -- confirm against the Enzyme documentation. */
int enzyme_const, enzyme_dup, enzyme_dupnoneed;
/* Declaration only; no definition appears in this file.
   NOTE(review): presumably resolved by the Enzyme plugin at compile time
   rather than by the linker -- confirm. */
double __enzyme_fwddiff(void*, ...);

/* Writes the absolute value of *t into *at.
   at: output location.
   t:  input location; only read. */
static void ipabsterm(double* at, double* t){
  const double value = *t;
  *at = fabs(value);
}

static void derivative_ipabsterm(double* at, double* d_at,
                                 double*  t, double* d_t){
  printf("Hi!\n");
  *at   = fmax(*t, *d_t);
  *d_at = fmax(*t, *d_t);
}

/* Two-entry table pairing the primal function (first entry) with its
   hand-written derivative (second entry).
   NOTE(review): presumably the Enzyme plugin picks this global up by its
   __enzyme_register_derivative_ name prefix -- confirm against the Enzyme
   custom-derivative documentation. */
void* __enzyme_register_derivative_ipabsterm[] = {
  (void*)ipabsterm,
  (void*)derivative_ipabsterm,
};

/* By-value wrapper around ipabsterm: passes the address of a local result
   slot and of the by-value argument, then returns the result.
   Marked optnone by the author. */
double __attribute__((optnone)) absterm(double t){
  double result;
  ipabsterm(&result, &t);
  return result;
}

/* Multiply-add helper: returns x*y + z.
   The product and the sum are written as a single expression. Whether a
   compiler fuses them into one rounding step depends on its floating-point
   contraction settings, which this file does not set. */
double _fma(double x, double y, double z){
  return x*y + z;
}

/* Evaluates the polynomial
     coefs[0] + coefs[1]*x + ... + coefs[len-1]*x^(len-1)
   with Horner's scheme, starting from the highest-order coefficient.

   x:     evaluation point.
   coefs: coefficients in ascending order of power; only read.
   len:   number of coefficients.

   Returns the polynomial value. A single-coefficient polynomial returns
   coefs[0] (the previous version returned 0.0 in that case), and a
   non-positive len returns 0.0 without touching coefs. */
double horner(double x, double* coefs, int len) {
  if (len <= 0) {
    return 0.0;
  }
  double acc = coefs[len-1];
  for (int k = len-1; k > 0; k--) {
    /* Kept as one expression so rounding matches the original recurrence. */
    acc = coefs[k-1] + x*acc;
  }
  return acc;
}

/* Approximates the gamma function at _x.

   Structure, as visible in the code:
     - negative input: s = sin(pi*_x) is computed, the argument is negated,
       and s is scaled by the negated argument; the final result is then
       pi divided by (s times the positive-argument quantity), which looks
       like a reflection-formula evaluation;
     - non-finite argument (after the sign flip): returned as is;
     - x > 11.5: sqrt(2*pi) * x^(x - 1/2) / exp(x) times a degree-9
       polynomial in 1/x (a Stirling-type expansion);
     - otherwise: x is shifted into [2, 3) while the shift factors are
       accumulated in z, and a rational approximation P/Q in (x - 2) is
       used.

   Returns NAN when sin(pi*_x) compares exactly equal to zero for a
   negative _x. */
double gamma(double _x) {

  const double sq2pi = sqrt(2*M_PI);

	double x = _x;
	/* s is assigned only on the negative branch below and is read later
	   only under the matching `_x < 0` tests. */
	double s;
	if(x < 0) {
		s = sin(M_PI * _x);
		/* NOTE(review): exact comparison; in floating point sin(M_PI * n) is
		   generally not exactly 0 for a nonzero integer n, so this may not
		   fire at negative integers -- TODO confirm. */
		if(s == 0) return NAN;
		x = -x;
		s *= x;
	}
	if(!isfinite(x)) return x;

	if(x > 11.5) {
		/* Polynomial in w = 1/x, evaluated with horner(). */
		double w = 1/x; 
		double coefs[10] = {1.0,
			8.333333333333331800504e-2, 3.472222222230075327854e-3, -2.681327161876304418288e-3, -2.294719747873185405699e-4,
			7.840334842744753003862e-4, 6.989332260623193171870e-5, -5.950237554056330156018e-4, -2.363848809501759061727e-5,
			7.147391378143610789273e-4
		};
		w = horner(w, coefs, 10);
		/* v = x^(x/2 - 1/4); it is applied twice below so that the product
		   v * v equals x^(x - 1/2). Presumably split this way to keep the
		   intermediate pow() result in range -- TODO confirm. */
		double muladd = _fma(0.5, x, -0.25);
		double v = pow(x, muladd);
		double res = sq2pi * v * (v / exp(x)) * w;

		if(_x < 0) {
			return M_PI / (res * s);
		} else {
			return res;
		}
	}
	/* Numerator (P) and denominator (Q) coefficients, ascending powers,
	   for the rational approximation used after range reduction. */
	double P[8] = {
		1.000000000000000000009e0, 8.378004301573126728826e-1, 3.629515436640239168939e-1, 1.113062816019361559013e-1,
		2.385363243461108252554e-2, 4.092666828394035500949e-3, 4.542931960608009155600e-4, 4.212760487471622013093e-5
	};
	double Q[9] = {
		9.999999999999999999908e-1, 4.150160950588455434583e-1, -2.243510905670329164562e-1, -4.633887671244534213831e-2,
		2.773706565840072979165e-2, -7.955933682494738320586e-4, -1.237799246653152231188e-3, 2.346584059160635244282e-4,
		-1.397148517476170440917e-5
	};

	/* Range reduction: step x down (multiplying z) or up (dividing z)
	   until 2 <= x < 3. */
	double z = 1.0;
	while(x >= 3.0) {
		x -= 1.0;
		z *= x;
	}
	while(x < 2.0) {
		z /= x;
		x += 1.0;
	}

	/* Evaluate P and Q at the offset from 2. */
	x -= 2.0;
	double p = horner(x, P, 8);
	double q = horner(x, Q, 9);
	return _x < 0 ? M_PI * q / (s * z * p) : z * p / q;
}

/* Truncated power-series evaluation in order v and argument x.
   NOTE(review): judging by the name this is the modified Bessel function
   K_v(x) -- confirm.

   Two running sums are accumulated: one whose terms are divided by (k - v)
   and one whose terms are divided by (k + v). At most 49 terms are added;
   the loop stops early once absterm() of the (k - v) term is no larger than
   2.220446049250313e-16 (2^-52).

   v: order; also passed to gamma().
   x: argument.
   Returns (gamma(v)*s1 + (x/2)^(2v) * ngam * s2) / (2 * (x/2)^v), where s1
   and s2 are the two sums and ngam is the prefactor computed below. */
double besselk_power_series(double v, double x) {
  const double gam_v  = gamma(v);
  const double ngam_v = M_PI/(sin(-M_PI*fabs(v))*gam_v*v);
  const double x_sq   = x*x;

  double sum_minus  = 0.0;
  double sum_plus   = 0.0;
  double term_minus = 1.0;
  double term_plus  = 1.0;

  int k = 1;
  while (k < 50) {
    const double four_k = 4*k;
    sum_minus += term_minus;
    sum_plus  += term_plus;
    term_minus *= x_sq/(four_k*(k-v));
    term_plus  *= x_sq/(four_k*(k+v));
    /* Convergence test goes through absterm() so the custom rule is hit. */
    const double abs_term = absterm(term_minus);
    if (abs_term <= 2.220446049250313e-16) {
      break;
    }
    k++;
  }

  const double half_x_pow_v = pow(x/2, v);
  const double combined = gam_v*sum_minus + half_x_pow_v*half_x_pow_v*ngam_v*sum_plus;
  return combined/(2*half_x_pow_v);
}

/* Calls __enzyme_fwddiff on besselk_power_series with v passed as an
   enzyme_dup (value, tangent) pair using a tangent of 1.0, and x passed as
   enzyme_const. Returns whatever __enzyme_fwddiff returns.
   NOTE(review): presumably this is the derivative with respect to v --
   confirm against the Enzyme documentation. */
double test(double v, double x) {
	const double seed_v = 1.0;
	return __enzyme_fwddiff((void*) besselk_power_series,
	                        enzyme_dup, v, seed_v,
	                        enzyme_const, x);
}

/* Entry point: evaluates test(1.02, 1.51) and prints the result in
   scientific notation with 16 digits after the decimal point.
   Command-line arguments are ignored. Always returns 0. */
int main(int argc, char** argv) {
  (void)argc;
  (void)argv;
  const double result = test(1.02, 1.51);
  printf("%1.16e\n", result);
  return 0;
}

Which I compile with

clang mwe.c -fplugin=/usr/lib/ClangEnzyme-18.so -O0 -Rpass=.* -lm -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -Wall -pedantic -o mwe

on Linux with clang 18. When the compiled code actually hits the rule, ./mwe prints a handful of Hi! lines before printing the result.

I have the -Rpass=.* flag to see all the compiler optimizations that get done, and depending on a few small tweaks I sometimes get something like

libmatern.c:[...]: remark: Cannot use provided custom derivative pass [-Rpass=enzyme]

for ipabsterm. But I'm really having trouble figuring out which compiler optimization is breaking things. As you can see from the optnone attribute I've put on the absterm wrapper, I suspected that the function was getting inlined and the loop vectorized, and that this was breaking things. But nothing I have tried has fixed it.

Any thoughts or suggestions you have would be greatly appreciated! Thanks so much in advance.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions