Skip to content

Commit

Permalink
Merge remote-tracking branch 'asf/branch_4x' into helio
Browse files Browse the repository at this point in the history
  • Loading branch information
yonik committed Jun 19, 2014
2 parents 39a82e7 + df4d8f8 commit 825ac1a
Show file tree
Hide file tree
Showing 77 changed files with 1,509 additions and 823 deletions.
4 changes: 3 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ API Changes
* LUCENE-5761: Removed DiskDocValuesFormat, it was very inefficient and saved very little
RAM over the default codec. (Robert Muir)

* LUCENE-5775: Deprecate JaspellLookup. (Mike McCandless)

Optimizations

* LUCENE-5603: hunspell stemmer more efficiently strips prefixes
Expand Down Expand Up @@ -246,7 +248,7 @@ Bug fixes
* LUCENE-5747: Project-specific settings for the eclipse development
environment will prevent automatic code reformatting. (Shawn Heisey)

* LUCENE-5768: Hunspell condition checks containing character classes
* LUCENE-5768, LUCENE-5777: Hunspell condition checks containing character classes
were buggy. (Clinton Gormley, Robert Muir)

Test Framework
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,25 @@ private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IO
}
return builder.finish();
}

static String escapeDash(String re) {
  // Escape every bare '-' in the condition regex. Although dash has no special
  // meaning here, some dictionaries (e.g. pt_PT) already escape it themselves,
  // so an existing "\-" sequence must be copied through untouched rather than
  // turned into "\\-" (which would nullify the escape).
  final StringBuilder out = new StringBuilder(re.length());
  int pos = 0;
  while (pos < re.length()) {
    final char ch = re.charAt(pos);
    if (ch == '-') {
      // unescaped dash: escape it
      out.append("\\-");
      pos++;
    } else if (ch == '\\' && pos + 1 < re.length()) {
      // escape sequence: copy both characters verbatim (covers a pre-escaped "\-")
      out.append(ch).append(re.charAt(pos + 1));
      pos += 2;
    } else {
      out.append(ch);
      pos++;
    }
  }
  return out.toString();
}

/**
* Parses a specific affix rule putting the result into the provided affix map
Expand Down Expand Up @@ -425,7 +444,7 @@ private void parseAffix(TreeMap<String,List<Character>> affixes,
}
// "dash hasn't got special meaning" (we must escape it)
if (condition.indexOf('-') >= 0) {
condition = condition.replace("-", "\\-");
condition = escapeDash(condition);
}

final String regex;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package org.apache.lucene.analysis.hunspell;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.junit.BeforeClass;

/**
 * Verifies that an affix condition whose dash is already escaped in the
 * dictionary (a "double-escaped" pattern such as {@code [^\-]ar} in
 * double-escaped.aff) still matches correctly — presumably the regression
 * test for LUCENE-5777 (see CHANGES.txt in this commit).
 */
public class TestDoubleEscape extends StemmerTestBase {
  @BeforeClass
  public static void beforeClass() throws Exception {
    // Load the fixtures added alongside this test: the .aff file contains
    // the pre-escaped condition "[^\-]ar", the .dic file the stem "adubar/X".
    init("double-escaped.aff", "double-escaped.dic");
  }

  public void testStemming() {
    // "adubo" must stem back to "adubar" via the X suffix rule.
    assertStemsTo("adubo", "adubar");
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SET UTF-8

SFX X Y 1
SFX X ar o [^\-]ar

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1
adubar/X
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,12 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.DoubleBarrelLRUCache;
import org.apache.lucene.util.RamUsageEstimator;

/** Handles a terms dict, but decouples all details of
* doc/freqs/positions reading to an instance of {@link
Expand All @@ -60,6 +62,9 @@
* @lucene.experimental */

public class BlockTermsReader extends FieldsProducer {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(BlockTermsReader.class);

// Open input to the main terms dict file (_X.tis)
private final IndexInput in;

Expand Down Expand Up @@ -227,7 +232,8 @@ public int size() {
return fields.size();
}

private class FieldReader extends Terms {
private static final long FIELD_READER_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FieldReader.class);
private class FieldReader extends Terms implements Accountable {
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
Expand All @@ -247,6 +253,11 @@ private class FieldReader extends Terms {
this.longsSize = longsSize;
}

@Override
public long ramBytesUsed() {
  // Shallow, precomputed instance size only; the underlying terms-dict data
  // appears to be accounted for at the enclosing reader level
  // (postingsReader/indexReader) — this constant covers just the
  // FieldReader object's own fields.
  return FIELD_READER_RAM_BYTES_USED;
}

@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
Expand Down Expand Up @@ -881,9 +892,14 @@ private void decodeMetaData() throws IOException {

@Override
public long ramBytesUsed() {
long sizeInBytes = (postingsReader!=null) ? postingsReader.ramBytesUsed() : 0;
sizeInBytes += (indexReader!=null) ? indexReader.ramBytesUsed() : 0;
return sizeInBytes;
long ramBytesUsed = BASE_RAM_BYTES_USED;
ramBytesUsed += (postingsReader!=null) ? postingsReader.ramBytesUsed() : 0;
ramBytesUsed += (indexReader!=null) ? indexReader.ramBytesUsed() : 0;
ramBytesUsed += fields.size() * 2L * RamUsageEstimator.NUM_BYTES_OBJECT_REF;
for (FieldReader reader : fields.values()) {
ramBytesUsed += reader.ramBytesUsed();
}
return ramBytesUsed;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;

import java.util.HashMap;
Expand All @@ -44,6 +45,8 @@
*/
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FixedGapTermsIndexReader.class);

// NOTE: long is overkill here, since this number is 128
// by default and only indexDivisor * 128 if you change
// the indexDivisor at search time. But, we use this in a
Expand All @@ -65,8 +68,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
private final static int PAGED_BYTES_BITS = 15;

// all fields share this single logical byte[]
private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
private PagedBytes.Reader termBytesReader;
private final PagedBytes.Reader termBytesReader;

final HashMap<FieldInfo,FieldIndexData> fields = new HashMap<>();

Expand All @@ -83,9 +85,9 @@ public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String seg
assert indexDivisor == -1 || indexDivisor > 0;

in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);

boolean success = false;
final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);

boolean success = false;
try {

version = readHeader(in);
Expand Down Expand Up @@ -130,7 +132,7 @@ public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String seg
throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + "numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
}
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(fieldInfo, termBytes, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
}
Expand All @@ -147,6 +149,8 @@ public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String seg
indexLoaded = true;
}
termBytesReader = termBytes.freeze(true);
} else {
termBytesReader = null;
}
}
}
Expand Down Expand Up @@ -254,7 +258,8 @@ public boolean supportsOrd() {
return true;
}

private final class FieldIndexData {
private static final long FIELD_INDEX_DATA_BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FieldIndexData.class);
private final class FieldIndexData implements Accountable {

volatile CoreFieldIndex coreIndex;

Expand All @@ -265,7 +270,7 @@ private final class FieldIndexData {

private final int numIndexTerms;

public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
public FieldIndexData(FieldInfo fieldInfo, PagedBytes termBytes, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
long packedOffsetsStart) throws IOException {

this.termsStart = termsStart;
Expand All @@ -275,13 +280,18 @@ public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, l
this.numIndexTerms = numIndexTerms;

if (indexDivisor > 0) {
loadTermsIndex();
loadTermsIndex(termBytes);
}
}

private void loadTermsIndex() throws IOException {
@Override
public long ramBytesUsed() {
  // coreIndex is lazily populated by loadTermsIndex() and only when
  // indexDivisor > 0; it is null when the index was opened without loading
  // the terms index, so guard the dereference (same null-check style used
  // by the other ramBytesUsed() implementations in this file).
  final CoreFieldIndex core = coreIndex; // single volatile read
  return FIELD_INDEX_DATA_BASE_RAM_BYTES_USED
      + ((core != null) ? core.ramBytesUsed() : 0);
}

private void loadTermsIndex(PagedBytes termBytes) throws IOException {
if (coreIndex == null) {
coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
coreIndex = new CoreFieldIndex(termBytes, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
}
}

Expand All @@ -300,7 +310,7 @@ private final class CoreFieldIndex implements Accountable {
final int numIndexTerms;
final long termsStart;

public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
public CoreFieldIndex(PagedBytes termBytes, long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {

this.termsStart = termsStart;
termBytesStart = termBytes.getPointer();
Expand Down Expand Up @@ -439,10 +449,11 @@ private void seekDir(IndexInput input, long dirOffset) throws IOException {

@Override
public long ramBytesUsed() {
long sizeInBytes = ((termBytes!=null) ? termBytes.ramBytesUsed() : 0) +
((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
long sizeInBytes = BASE_RAM_BYTES_USED
+ fields.size() * 2L * RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ ((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
for(FieldIndexData entry : fields.values()) {
sizeInBytes += entry.coreIndex.ramBytesUsed();
sizeInBytes += entry.ramBytesUsed();
}
return sizeInBytes;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ public FieldsProducer fieldsProducer(SegmentReadState state)
return new BloomFilteredFieldsProducer(state);
}

public class BloomFilteredFieldsProducer extends FieldsProducer {
static class BloomFilteredFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
HashMap<String,FuzzySet> bloomsByFieldName = new HashMap<>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,16 @@ public void checkIntegrity() throws IOException {

private final static class DirectField extends Terms implements Accountable {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(DirectField.class);

private static abstract class TermAndSkip implements Accountable {
public int[] skips;
}

private static final class LowFreqTerm extends TermAndSkip {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(HighFreqTerm.class);

public final int[] postings;
public final byte[] payloads;
public final int docFreq;
Expand All @@ -198,13 +203,17 @@ public LowFreqTerm(int[] postings, byte[] payloads, int docFreq, int totalTermFr

@Override
public long ramBytesUsed() {
return ((postings!=null) ? RamUsageEstimator.sizeOf(postings) : 0) +
return BASE_RAM_BYTES_USED +
((postings!=null) ? RamUsageEstimator.sizeOf(postings) : 0) +
((payloads!=null) ? RamUsageEstimator.sizeOf(payloads) : 0);
}
}

// TODO: maybe specialize into prx/no-prx/no-frq cases?
private static final class HighFreqTerm extends TermAndSkip {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(HighFreqTerm.class);

public final long totalTermFreq;
public final int[] docIDs;
public final int[] freqs;
Expand All @@ -221,19 +230,22 @@ public HighFreqTerm(int[] docIDs, int[] freqs, int[][] positions, byte[][][] pay

@Override
public long ramBytesUsed() {
long sizeInBytes = 0;
long sizeInBytes = BASE_RAM_BYTES_USED;
sizeInBytes += (docIDs!=null)? RamUsageEstimator.sizeOf(docIDs) : 0;
sizeInBytes += (freqs!=null)? RamUsageEstimator.sizeOf(freqs) : 0;

if(positions != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(positions);
for(int[] position : positions) {
sizeInBytes += (position!=null) ? RamUsageEstimator.sizeOf(position) : 0;
}
}

if (payloads != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(payloads);
for(byte[][] payload : payloads) {
if(payload != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(payload);
for(byte[] pload : payload) {
sizeInBytes += (pload!=null) ? RamUsageEstimator.sizeOf(pload) : 0;
}
Expand Down Expand Up @@ -504,14 +516,15 @@ public DirectField(SegmentReadState state, String field, Terms termsIn, int minS

@Override
public long ramBytesUsed() {
long sizeInBytes = 0;
long sizeInBytes = BASE_RAM_BYTES_USED;
sizeInBytes += ((termBytes!=null) ? RamUsageEstimator.sizeOf(termBytes) : 0);
sizeInBytes += ((termOffsets!=null) ? RamUsageEstimator.sizeOf(termOffsets) : 0);
sizeInBytes += ((skips!=null) ? RamUsageEstimator.sizeOf(skips) : 0);
sizeInBytes += ((skipOffsets!=null) ? RamUsageEstimator.sizeOf(skipOffsets) : 0);
sizeInBytes += ((sameCounts!=null) ? RamUsageEstimator.sizeOf(sameCounts) : 0);

if(terms!=null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(terms);
for(TermAndSkip termAndSkip : terms) {
sizeInBytes += (termAndSkip!=null) ? termAndSkip.ramBytesUsed() : 0;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ static<T> void walk(FST<T> fst) throws IOException {

@Override
public long ramBytesUsed() {
long ramBytesUsed = 0;
long ramBytesUsed = postingsReader.ramBytesUsed();
for (TermsReader r : fields.values()) {
if (r.index != null) {
ramBytesUsed += r.index.ramBytesUsed();
Expand Down
Loading

0 comments on commit 825ac1a

Please sign in to comment.