Skip to content

Commit

Permalink
Merge remote-tracking branch 'asf/branch_4x' into helio
Browse files Browse the repository at this point in the history
  • Loading branch information
yonik committed Jun 19, 2014
2 parents 39a82e7 + df4d8f8 commit 825ac1a
Show file tree
Hide file tree
Showing 77 changed files with 1,509 additions and 823 deletions.
4 changes: 3 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ API Changes
* LUCENE-5761: Removed DiskDocValuesFormat, it was very inefficient and saved very little
RAM over the default codec. (Robert Muir)

* LUCENE-5775: Deprecate JaspellLookup. (Mike McCandless)

Optimizations

* LUCENE-5603: hunspell stemmer more efficiently strips prefixes
Expand Down Expand Up @@ -246,7 +248,7 @@ Bug fixes
* LUCENE-5747: Project-specific settings for the eclipse development
environment will prevent automatic code reformatting. (Shawn Heisey)

* LUCENE-5768: Hunspell condition checks containing character classes
* LUCENE-5768, LUCENE-5777: Hunspell condition checks containing character classes
were buggy. (Clinton Gormley, Robert Muir)

Test Framework
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,25 @@ private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IO
}
return builder.finish();
}

static String escapeDash(String re) {
  // Escape every bare '-' in the condition regex. Although dash has no special
  // meaning here, some dictionaries (e.g. pt_PT) already escape it themselves,
  // so an existing "\-" sequence must be copied through untouched rather than
  // turned into "\\-" (which would nullify the escape).
  final StringBuilder out = new StringBuilder(re.length());
  int pos = 0;
  while (pos < re.length()) {
    final char ch = re.charAt(pos);
    if (ch == '-') {
      // unescaped dash: escape it
      out.append("\\-");
      pos++;
    } else if (ch == '\\' && pos + 1 < re.length()) {
      // escape sequence: copy both characters verbatim (covers a pre-escaped "\-")
      out.append(ch).append(re.charAt(pos + 1));
      pos += 2;
    } else {
      out.append(ch);
      pos++;
    }
  }
  return out.toString();
}

/**
* Parses a specific affix rule putting the result into the provided affix map
Expand Down Expand Up @@ -425,7 +444,7 @@ private void parseAffix(TreeMap<String,List<Character>> affixes,
}
// "dash hasn't got special meaning" (we must escape it)
if (condition.indexOf('-') >= 0) {
condition = condition.replace("-", "\\-");
condition = escapeDash(condition);
}

final String regex;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package org.apache.lucene.analysis.hunspell;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.junit.BeforeClass;

/**
 * Verifies that an affix condition whose dash is already escaped in the
 * dictionary (a "double-escaped" pattern such as {@code [^\-]ar} in
 * double-escaped.aff) still matches correctly — presumably the regression
 * test for LUCENE-5777 (see CHANGES.txt in this commit).
 */
public class TestDoubleEscape extends StemmerTestBase {
  @BeforeClass
  public static void beforeClass() throws Exception {
    // Load the fixtures added alongside this test: the .aff file contains
    // the pre-escaped condition "[^\-]ar", the .dic file the stem "adubar/X".
    init("double-escaped.aff", "double-escaped.dic");
  }

  public void testStemming() {
    // "adubo" must stem back to "adubar" via the X suffix rule.
    assertStemsTo("adubo", "adubar");
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SET UTF-8

SFX X Y 1
SFX X ar o [^\-]ar

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1
adubar/X
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,12 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.DoubleBarrelLRUCache;
import org.apache.lucene.util.RamUsageEstimator;

/** Handles a terms dict, but decouples all details of
* doc/freqs/positions reading to an instance of {@link
Expand All @@ -60,6 +62,9 @@
* @lucene.experimental */

public class BlockTermsReader extends FieldsProducer {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(BlockTermsReader.class);

// Open input to the main terms dict file (_X.tis)
private final IndexInput in;

Expand Down Expand Up @@ -227,7 +232,8 @@ public int size() {
return fields.size();
}

private class FieldReader extends Terms {
private static final long FIELD_READER_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FieldReader.class);
private class FieldReader extends Terms implements Accountable {
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
Expand All @@ -247,6 +253,11 @@ private class FieldReader extends Terms {
this.longsSize = longsSize;
}

@Override
public long ramBytesUsed() {
  // Shallow, precomputed instance size only; the underlying terms-dict data
  // appears to be accounted for at the enclosing reader level
  // (postingsReader/indexReader) — this constant covers just the
  // FieldReader object's own fields.
  return FIELD_READER_RAM_BYTES_USED;
}

@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
Expand Down Expand Up @@ -881,9 +892,14 @@ private void decodeMetaData() throws IOException {

@Override
public long ramBytesUsed() {
long sizeInBytes = (postingsReader!=null) ? postingsReader.ramBytesUsed() : 0;
sizeInBytes += (indexReader!=null) ? indexReader.ramBytesUsed() : 0;
return sizeInBytes;
long ramBytesUsed = BASE_RAM_BYTES_USED;
ramBytesUsed += (postingsReader!=null) ? postingsReader.ramBytesUsed() : 0;
ramBytesUsed += (indexReader!=null) ? indexReader.ramBytesUsed() : 0;
ramBytesUsed += fields.size() * 2L * RamUsageEstimator.NUM_BYTES_OBJECT_REF;
for (FieldReader reader : fields.values()) {
ramBytesUsed += reader.ramBytesUsed();
}
return ramBytesUsed;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;

import java.util.HashMap;
Expand All @@ -44,6 +45,8 @@
*/
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FixedGapTermsIndexReader.class);

// NOTE: long is overkill here, since this number is 128
// by default and only indexDivisor * 128 if you change
// the indexDivisor at search time. But, we use this in a
Expand All @@ -65,8 +68,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
private final static int PAGED_BYTES_BITS = 15;

// all fields share this single logical byte[]
private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
private PagedBytes.Reader termBytesReader;
private final PagedBytes.Reader termBytesReader;

final HashMap<FieldInfo,FieldIndexData> fields = new HashMap<>();

Expand All @@ -83,9 +85,9 @@ public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String seg
assert indexDivisor == -1 || indexDivisor > 0;

in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);

boolean success = false;
final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);

boolean success = false;
try {

version = readHeader(in);
Expand Down Expand Up @@ -130,7 +132,7 @@ public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String seg
throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + "numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
}
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(fieldInfo, termBytes, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
}
Expand All @@ -147,6 +149,8 @@ public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String seg
indexLoaded = true;
}
termBytesReader = termBytes.freeze(true);
} else {
termBytesReader = null;
}
}
}
Expand Down Expand Up @@ -254,7 +258,8 @@ public boolean supportsOrd() {
return true;
}

private final class FieldIndexData {
private static final long FIELD_INDEX_DATA_BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FieldIndexData.class);
private final class FieldIndexData implements Accountable {

volatile CoreFieldIndex coreIndex;

Expand All @@ -265,7 +270,7 @@ private final class FieldIndexData {

private final int numIndexTerms;

public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
public FieldIndexData(FieldInfo fieldInfo, PagedBytes termBytes, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
long packedOffsetsStart) throws IOException {

this.termsStart = termsStart;
Expand All @@ -275,13 +280,18 @@ public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, l
this.numIndexTerms = numIndexTerms;

if (indexDivisor > 0) {
loadTermsIndex();
loadTermsIndex(termBytes);
}
}

private void loadTermsIndex() throws IOException {
@Override
public long ramBytesUsed() {
  // coreIndex is lazily populated by loadTermsIndex() and only when
  // indexDivisor > 0; it is null when the index was opened without loading
  // the terms index, so guard the dereference (same null-check style used
  // by the other ramBytesUsed() implementations in this file).
  final CoreFieldIndex core = coreIndex; // single volatile read
  return FIELD_INDEX_DATA_BASE_RAM_BYTES_USED
      + ((core != null) ? core.ramBytesUsed() : 0);
}

private void loadTermsIndex(PagedBytes termBytes) throws IOException {
if (coreIndex == null) {
coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
coreIndex = new CoreFieldIndex(termBytes, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
}
}

Expand All @@ -300,7 +310,7 @@ private final class CoreFieldIndex implements Accountable {
final int numIndexTerms;
final long termsStart;

public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
public CoreFieldIndex(PagedBytes termBytes, long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {

this.termsStart = termsStart;
termBytesStart = termBytes.getPointer();
Expand Down Expand Up @@ -439,10 +449,11 @@ private void seekDir(IndexInput input, long dirOffset) throws IOException {

@Override
public long ramBytesUsed() {
long sizeInBytes = ((termBytes!=null) ? termBytes.ramBytesUsed() : 0) +
((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
long sizeInBytes = BASE_RAM_BYTES_USED
+ fields.size() * 2L * RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ ((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
for(FieldIndexData entry : fields.values()) {
sizeInBytes += entry.coreIndex.ramBytesUsed();
sizeInBytes += entry.ramBytesUsed();
}
return sizeInBytes;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ public FieldsProducer fieldsProducer(SegmentReadState state)
return new BloomFilteredFieldsProducer(state);
}

public class BloomFilteredFieldsProducer extends FieldsProducer {
static class BloomFilteredFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
HashMap<String,FuzzySet> bloomsByFieldName = new HashMap<>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,16 @@ public void checkIntegrity() throws IOException {

private final static class DirectField extends Terms implements Accountable {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(DirectField.class);

private static abstract class TermAndSkip implements Accountable {
public int[] skips;
}

private static final class LowFreqTerm extends TermAndSkip {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(HighFreqTerm.class);

public final int[] postings;
public final byte[] payloads;
public final int docFreq;
Expand All @@ -198,13 +203,17 @@ public LowFreqTerm(int[] postings, byte[] payloads, int docFreq, int totalTermFr

@Override
public long ramBytesUsed() {
return ((postings!=null) ? RamUsageEstimator.sizeOf(postings) : 0) +
return BASE_RAM_BYTES_USED +
((postings!=null) ? RamUsageEstimator.sizeOf(postings) : 0) +
((payloads!=null) ? RamUsageEstimator.sizeOf(payloads) : 0);
}
}

// TODO: maybe specialize into prx/no-prx/no-frq cases?
private static final class HighFreqTerm extends TermAndSkip {

private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(HighFreqTerm.class);

public final long totalTermFreq;
public final int[] docIDs;
public final int[] freqs;
Expand All @@ -221,19 +230,22 @@ public HighFreqTerm(int[] docIDs, int[] freqs, int[][] positions, byte[][][] pay

@Override
public long ramBytesUsed() {
long sizeInBytes = 0;
long sizeInBytes = BASE_RAM_BYTES_USED;
sizeInBytes += (docIDs!=null)? RamUsageEstimator.sizeOf(docIDs) : 0;
sizeInBytes += (freqs!=null)? RamUsageEstimator.sizeOf(freqs) : 0;

if(positions != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(positions);
for(int[] position : positions) {
sizeInBytes += (position!=null) ? RamUsageEstimator.sizeOf(position) : 0;
}
}

if (payloads != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(payloads);
for(byte[][] payload : payloads) {
if(payload != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(payload);
for(byte[] pload : payload) {
sizeInBytes += (pload!=null) ? RamUsageEstimator.sizeOf(pload) : 0;
}
Expand Down Expand Up @@ -504,14 +516,15 @@ public DirectField(SegmentReadState state, String field, Terms termsIn, int minS

@Override
public long ramBytesUsed() {
long sizeInBytes = 0;
long sizeInBytes = BASE_RAM_BYTES_USED;
sizeInBytes += ((termBytes!=null) ? RamUsageEstimator.sizeOf(termBytes) : 0);
sizeInBytes += ((termOffsets!=null) ? RamUsageEstimator.sizeOf(termOffsets) : 0);
sizeInBytes += ((skips!=null) ? RamUsageEstimator.sizeOf(skips) : 0);
sizeInBytes += ((skipOffsets!=null) ? RamUsageEstimator.sizeOf(skipOffsets) : 0);
sizeInBytes += ((sameCounts!=null) ? RamUsageEstimator.sizeOf(sameCounts) : 0);

if(terms!=null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(terms);
for(TermAndSkip termAndSkip : terms) {
sizeInBytes += (termAndSkip!=null) ? termAndSkip.ramBytesUsed() : 0;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ static<T> void walk(FST<T> fst) throws IOException {

@Override
public long ramBytesUsed() {
long ramBytesUsed = 0;
long ramBytesUsed = postingsReader.ramBytesUsed();
for (TermsReader r : fields.values()) {
if (r.index != null) {
ramBytesUsed += r.index.ramBytesUsed();
Expand Down
Loading

0 comments on commit 825ac1a

Please sign in to comment.