BernoulliSubstring.java.txt

import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.distribution.PoissonDistribution;
import org.apache.commons.math3.distribution.NormalDistribution;

/**
 * A substring that can compute the exact expectation and variance of the number of its
 * occurrences in a string generated by a given IID source, as well as scores of 
 * statistical surprise, using information from its suffixes. See 
 * \cite{Efficient_detection_of_unusual_words} for notation and algorithmics, and
 * \cite{Monotony_of_surprise_and_large-scale_quest_for_unusual_words} for statistics.
 *
 * Remark: we don't use Apache Commons' $FastMath$ routines for elementary math 
 * operations, because $FastMath$ is never faster than $Math$ in experiments (likely due 
 * to internalization). We use $StrictMath.scalb$ rather than $Math.scalb$ because the 
 * former is approximately twice as fast as the latter. See comments in 
 * $CompareFastMath.java$.
 */
public class BernoulliSubstring extends BorderSubstring {
	/**
	 * Scores are defined in method $getScore$
	 */
	private static final byte SCORE_ID = 8;
	
	/**
	 * TRUE=use a tighter upper bound for the Poisson error in method
	 * $getCumulativeProbability$.
	 */
	private static final boolean TIGHT_POISSON_ERROR = true;
	
	public double pHatLog, capitalT, capitalB;
	public double expectation, variance;
	
	
	public BernoulliSubstring(int alphabetLength, int log2alphabetLength, int textLength, int log2textLength) {
		super(alphabetLength,log2alphabetLength,textLength,log2textLength);
	}
	
	
	public static int serializedSize(int alphabetLength, int log2alphabetLength) {
		return BorderSubstring.serializedSize(alphabetLength,log2alphabetLength)+64*(SCORE_ID<=7?1:3);
	}
	
	
	public void serialize(Stack stack) {
		super.serialize(stack);
		stack.push(Double.doubleToLongBits(pHatLog),64);
		if (SCORE_ID<=7) return;
		stack.push(Double.doubleToLongBits(capitalT),64);
		stack.push(Double.doubleToLongBits(capitalB),64);
	}
	
	
	public void deserialize(Stack stack) {
		super.deserialize(stack);
		pHatLog=Double.longBitsToDouble(stack.read(64));
		if (SCORE_ID<=7) return;
		capitalT=Double.longBitsToDouble(stack.read(64));
		capitalB=Double.longBitsToDouble(stack.read(64));
	}
	
	
	public int skip(Stack stack) {
		final int log2address = super.skip(stack);
		stack.skip(64);
		if (SCORE_ID>7) { stack.skip(64); stack.skip(64); }
		return log2address;
	}
	
	
	/**
	 * Remark: the recursive computations in this method hold only for 
	 * $length<=HALF_TEXT_LENGTH$.
	 *
	 * @param logProbA $\log(\mathbb{P}[a])$;
	 * @param buffer1 scratch space; at the end of the procedure it contains $bord(v)$;
	 * @param buffer2 scratch space; at the end of the procedure it contains 
	 * $bord(bord(v))$.
	 */
	public final void initFromSuffix(long a, double logProbA, BernoulliSubstring suffix, Stack stack, Stack characterStack, BernoulliSubstring buffer1, BernoulliSubstring buffer2) {	
		final boolean hasBorder;
		final long maxOccurrences;
		final double pHat, longestBorderPHatLog, longestBorderLongestBorderPHatLog, longestBorderCapitalB, longestBorderCapitalT, ratio;
		super.initFromSuffix(a,suffix,stack,characterStack,buffer1);
		
		// Expectation and first term of the variance
		maxOccurrences=TEXT_LENGTH-length+1;
		pHatLog=suffix.pHatLog+logProbA;
		pHat=Math.exp(pHatLog);
		expectation=maxOccurrences*pHat;
		if (SCORE_ID<=6) return;
		variance=expectation*(1-pHat);
		if (SCORE_ID==7) return;
		
		// Second and third terms of the variance
		final double pHatSquare = pHat*pHat;
		variance-=pHatSquare*((TEXT_LENGTH<<1)-3*length+2)*(length-1);
		if (rightLength==0) {
			capitalT=0; capitalB=0;
			return;
		}
		longestBorderPHatLog=buffer1.pHatLog;  // $buffer1$ contains $bord(v)$ at this point
		longestBorderCapitalB=buffer1.capitalB;
		longestBorderCapitalT=buffer1.capitalT;
		ratio=Math.exp(pHatLog-longestBorderPHatLog);
		hasBorder=buffer1.getLongestBorder(stack,buffer2);
		if (!hasBorder) capitalT=0;
		else {
			longestBorderLongestBorderPHatLog=buffer2.pHatLog;
			capitalT=longestBorderCapitalT*ratio +
					 Math.exp(pHatLog-longestBorderLongestBorderPHatLog);
		}
		capitalB=(TEXT_LENGTH-(length<<1)+1+longestBorderLength)*ratio +
				 ((longestBorderLength-length)<<1)*capitalT + 
				 ratio*longestBorderCapitalB;
		variance+=StrictMath.scalb(pHat,1)*capitalB;
	}


	// --------------------------- SCORING PROCEDURES ------------------------------------
	/**
	 * Determines which nodes of the suffix tree of the text to explore in order to
	 * detect surprising substrings of a given type. When the score is non-monotonic 
	 * inside a left-equivalence class, or when it is not a convex function of a monotonic
	 * score, or when it is not a convex and increasing function of a convex function of a
	 * monotonic score, we cannot limit the search to nodes of the suffix tree and to 
	 * their one-character extensions, thus the whole approach breaks down.
	 *
	 * @param stringType 0: over-represented; 1: under-represented; 2: both;
	 * @param maxProbability $\max\{\mathbb{P}(a) : a \in \Sigma\}$;
	 * @return 0: suffix-tree nodes; 1: one-character extensions of suffix-tree nodes; 
	 * 2: both; 3: don't explore any node, i.e. the whole approach fails.
	 */
	public static final byte nodesToExplore(byte stringType, double maxProbability) {
		if (SCORE_ID<=5) return stringType;
		if (SCORE_ID==6) return 2;
		if (SCORE_ID==7) {
			if (maxProbability<0.5) return stringType;
			else return 3;
		}
		if (SCORE_ID==8) {
			// These three upper bounds refer to function $f(m)=1/((4m)^{1/m})$ evaluated at 
			// $m=1,2$, and to $\sqrt{2}-1$. Just these two values of $m$ need to be 
			// considered since $f(m)>\sqrt{2}-1$ at $m \geq 3$.
			if (maxProbability<Math.min(Math.min(Math.sqrt(2)-1,1d/Math.sqrt(8)),0.25)) return stringType;
			else return 3;
		}
		if (SCORE_ID==9) {
			// These three upper bounds refer to function $f(m)=1/((4m)^{1/m})$ evaluated at 
			// $m=1,2$, and to $\sqrt{2}-1$. Just these two values of $m$ need to be 
			// considered since $f(m)>\sqrt{2}-1$ at $m \geq 3$.
			if (maxProbability<Math.min(Math.min(Math.sqrt(2)-1,1d/Math.sqrt(8)),0.25)) return 2;
			else return 3;
		}
		return 3;
	}
	
	
	public final double getScore(double frequency) {
		switch (SCORE_ID) {
			// Expectation only
			case 1: return frequency-expectation;
			case 2: return frequency/expectation;
			case 3: return (frequency-expectation)/expectation;
			case 4: return (frequency-expectation)/Math.sqrt(expectation);
			case 5: return Math.abs(frequency-expectation)/Math.sqrt(expectation);
			case 6: return (frequency-expectation)*(frequency-expectation)/expectation;
			case 7: return (frequency-expectation)/Math.sqrt(variance);  // Assumes that $variance$ is the approximation $expectation*(1-pHat)$
			// Expectation and variance
			case 8: return (frequency-expectation)/Math.sqrt(variance);
			case 9: return Math.abs(frequency-expectation)/Math.sqrt(variance);
		}
		return 0;
	}

	
	/**
	 * Computes the probability that the number of occurrences $occ$ of $v$ in an IID text 
	 * of length $TEXT_LENGTH$ is less than or equal to $observedOccurrences$.
	 *
	 * Remark: this method uses some potentially slow procedures in the Apache Commons 
	 * Math library to compute cumulative probabilities for normal and Poisson 
	 * distribution.
	 *
	 * @return $out[0]=\mathbb{P}[occ \leq observedOccurrences]$; $out[1]$: upper bound on 
	 * the absolute error in cell 0 when the Poisson distribution is used to model $occ$.
	 */
	public final void getCumulativeProbability(int observedOccurrences, double[] out) {
		final int SIGNIFICANTLY_GREATER_THAN = 100;
		final double pHat, pHatSquare, shortestPeriod;
		final double probability;
		double poissonError = 0;
		
		shortestPeriod=length-longestBorderLength;
		pHat=Math.exp(pHatLog);
		pHatSquare=pHat*pHat;
		if (shortestPeriod/length>SIGNIFICANTLY_GREATER_THAN*ONE_OVER_LOG_TEXT_LENGTH && TEXT_LENGTH>SIGNIFICANTLY_GREATER_THAN*length) {
			// Using Poisson distribution with mean equal to $expectation$.
			// b1=pHatSquare*(-3*length*length+(length<<2)+((length*TEXT_LENGTH)<<1)-TEXT_LENGTH-1);
			// b2=b1+variance-expectation;
			poissonError=variance-expectation+StrictMath.scalb(pHatSquare,1)*(-3*length*length+(length<<2)+((length*TEXT_LENGTH)<<1)-TEXT_LENGTH-1);
			if (TIGHT_POISSON_ERROR) poissonError*=-Math.expm1(0d-expectation)/expectation;
			probability=(new PoissonDistribution(expectation)).cumulativeProbability(observedOccurrences);
		}
		else probability=(new NormalDistribution(expectation,Math.sqrt(variance))).cumulativeProbability(observedOccurrences);
		out[0]=probability; out[1]=poissonError;
	}	
	
	
	// ----------------------------------- TESTING ---------------------------------------
	public static void main(String[] args) {
		testCorrectness();
	}
	
	
	private static final void testCorrectness() {
		final int TEXT_LENGTH = 10000;
		final int N_ITERATIONS = 10;
		final int STRINGS_PER_REGION = 30;
		boolean longestBorder, isBorder;
		int i, j, t, length, nBorders;
		long[] borders = new long[4];
		long a;
		double p;
		BernoulliSubstring top, suffix, buffer1, buffer2, tmp;
		
		for (t=0; t<N_ITERATIONS; t++) {
			String text = "";
			for (i=0; i<TEXT_LENGTH; i++) {
				p=Math.random();
				if (p<0.25) text+='0';
				else if (p<0.5) text+='1';
				else if (p<0.75) text+='2';
				else text+='3';
			}
			
			// Testing
			Stack stack = new Stack((BernoulliSubstring.serializedSize(4,2)*STRINGS_PER_REGION)>>>6);
			Stack characterStack = new Stack(STRINGS_PER_REGION);
			top = new BernoulliSubstring(4,2,TEXT_LENGTH,32);
			suffix = new BernoulliSubstring(4,2,TEXT_LENGTH,32);
			buffer1 = new BernoulliSubstring(4,2,TEXT_LENGTH,32);
			buffer2 = new BernoulliSubstring(4,2,TEXT_LENGTH,32);
			top.serialize(stack); 
			//top.print();
			for (i=text.length()-1; i>=0; i--) {  // Inserting $test$ from right to left
				if (text.charAt(i)=='0') a=0L;
				else if (text.charAt(i)=='1') a=1L;
				else if (text.charAt(i)=='2') a=2L;
				else a=3L;
				characterStack.push(a,2);
				
				// Computing borders using brute force
				for (j=0; j<4; j++) borders[j]=-1;
				for (length=1; length<text.length()-i; length++) {
					isBorder=true;
					for (j=0; j<length; j++) {
						if (text.charAt(i+j)!=text.charAt(text.length()-length+j)) {
							isBorder=false;
							break;
						}
					}
					if (isBorder) {
						if (text.charAt(text.length()-length-1)=='0') borders[0]=length;
						else if (text.charAt(text.length()-length-1)=='1') borders[1]=length;
						else if (text.charAt(text.length()-length-1)=='2') borders[2]=length;
						else borders[3]=length;
					}
				}
			
				// Computing borders using recursion
				tmp=suffix; suffix=top; top=tmp;
				top.initFromSuffix(a,-1.386294361119891,suffix,stack,characterStack,buffer1,buffer2);  // Overwrites all variables in $top$
				top.serialize(stack);  // Overwrites $address$
				//top.print();
				
				// Checking correctness
				nBorders=0;
				for (j=0; j<4; j++) {
					if (borders[j]!=-1) nBorders++;
				}
				if (top.rightLength!=nBorders) {
					System.err.println("ERROR at suffix <"+text.substring(i)+">:");
					System.err.println("rightLength="+top.rightLength+" (should be "+nBorders+")");
					System.exit(1);
				}
				for (j=0; j<top.rightLength; j++) {
					if (borders[(int)top.rightCharacters[j]]==-1) {
						System.err.println("ERROR at suffix <"+text.substring(i)+">:");
						System.err.println("character "+top.rightCharacters[j]+" should not have a border");
						System.exit(1);
					}
					stack.setPosition(top.rightPointers[j]);
					buffer1.deserialize(stack);
					if (buffer1.length!=borders[(int)top.rightCharacters[j]]) {
						System.err.println("ERROR at suffix <"+text.substring(i)+">:");
						System.err.println("character "+top.rightCharacters[j]+" should have a border of length "+borders[(int)top.rightCharacters[j]]+" rather than "+buffer1.length);
						System.exit(1);
					}
				}
			}
		}
		System.out.println("No errors found");
	}
	
	
	public void print() {
		super.print();
		System.out.println("pHatLog="+pHatLog);
		System.out.println("capitalT="+capitalT);
		System.out.println("capitalB="+capitalB);
	}
	
}