Skip to content

Commit

Permalink
Testing TfIdfCalculator
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Nov 20, 2022
1 parent fb97d33 commit b4f521a
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import de.gwdg.metadataqa.api.model.pathcache.PathCache;
import de.gwdg.metadataqa.api.problemcatalog.FieldCounterBasedResult;
import de.gwdg.metadataqa.api.schema.Schema;
import de.gwdg.metadataqa.api.uniqueness.SolrClient;
import de.gwdg.metadataqa.api.uniqueness.SolrConfiguration;
import de.gwdg.metadataqa.api.uniqueness.TfIdf;
import de.gwdg.metadataqa.api.uniqueness.TfIdfExtractor;
Expand Down Expand Up @@ -60,6 +61,7 @@ public class TfIdfCalculator implements Calculator, Serializable {
private Map<String, List<TfIdf>> termsCollection;
private boolean termCollectionEnabled = false;
private Schema schema;
private SolrClient solrClient;

/**
 * No-argument constructor. It performs no initialization of its own, so
 * every field keeps its declared default (for example
 * {@code termCollectionEnabled} stays {@code false}).
 */
public TfIdfCalculator() {
}
Expand All @@ -80,7 +82,9 @@ public List<MetricResult> measure(PathCache cache) {
recordId = recordId.substring(1);
}

String solrJsonResponse = getSolrResponse(recordId);
String solrJsonResponse = solrClient != null
? solrClient.getTfIdfResponse(String.format(SOLR_SEARCH_PARAMS, recordId).replace("\"", "%22"), recordId)
: getSolrResponse(recordId);
var extractor = new TfIdfExtractor(schema);
FieldCounter<Double> resultMap = extractor.extract(solrJsonResponse, recordId, termCollectionEnabled);
termsCollection = extractor.getTermsCollection();
Expand Down Expand Up @@ -153,4 +157,8 @@ public String getSolrSearchPath() {
}
return this.solrSearchPath;
}

/**
 * Sets the Solr client this calculator should use.
 *
 * <p>When a non-null client is set, {@code measure} asks it for the TF-IDF
 * response via {@code getTfIdfResponse}; otherwise {@code measure} falls
 * back to {@code getSolrResponse}.
 *
 * @param solrClient the client to use; may be {@code null} to keep the fallback path
 */
public void setSolrClient(SolrClient solrClient) {
this.solrClient = solrClient;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ public String getSolrSearchResponse(String solrField, String value) {
return connect(url, solrField, value);
}

/**
 * Fetches the term-vector (TF-IDF) response for one record.
 *
 * @param params   the request path/query part that is appended to the Solr base path
 * @param recordId the identifier of the record, passed on to {@code connect} as the value
 * @return whatever {@code connect} returns for the assembled URL
 */
@Override
public String getTfIdfResponse(String params, String recordId) {
  // The full request URL is simply the base path followed by the caller-supplied part.
  String tfIdfUrl = getSolrBasePath() + params;
  return connect(tfIdfUrl, "tf-idf", recordId);
}

public String buildUrl(String solrField, String value) {
String url;
if (value.equals("*")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

public interface SolrClient extends Serializable {
String getSolrSearchResponse(String solrField, String value);
String getTfIdfResponse(String params, String recordId);
void indexMap(String id, Map<String, List<String>> objectMap) throws IOException, SolrServerException;
void commit();
void deleteAll();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public FieldCounter<Double> extract(String jsonString, String recordId, boolean
Object document = JSON_PROVIDER.parse(jsonString);
var path = String.format("$.termVectors.['%s']", recordId);
Map value = (LinkedHashMap) JsonPath.read(document, path);

for (JsonBranch jsonBranch : schema.getIndexFields()) {
if (doCollectTerms) {
termsCollection.put(jsonBranch.getJsonPath(), new ArrayList<>());
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
package de.gwdg.metadataqa.api.calculator;

import de.gwdg.metadataqa.api.interfaces.MetricResult;
import de.gwdg.metadataqa.api.json.JsonBranch;
import de.gwdg.metadataqa.api.model.PathCacheFactory;
import de.gwdg.metadataqa.api.model.XmlFieldInstance;
import de.gwdg.metadataqa.api.model.pathcache.CsvPathCache;
import de.gwdg.metadataqa.api.schema.BaseSchema;
import de.gwdg.metadataqa.api.schema.CsvAwareSchema;
import de.gwdg.metadataqa.api.schema.Format;
import de.gwdg.metadataqa.api.schema.Schema;
import de.gwdg.metadataqa.api.schema.edm.EdmFullBeanSchema;
import de.gwdg.metadataqa.api.schema.edm.EdmOaiPmhJsonSchema;
import java.util.Arrays;
import java.util.List;

import de.gwdg.metadataqa.api.uniqueness.SolrClient;
import de.gwdg.metadataqa.api.uniqueness.SolrClientMock;
import de.gwdg.metadataqa.api.uniqueness.SolrConfiguration;
import de.gwdg.metadataqa.api.util.CsvReader;
import org.junit.Test;
import static org.junit.Assert.*;

Expand Down Expand Up @@ -34,4 +48,63 @@ public void getCalculatorName() throws Exception {
TfIdfCalculator calculator = new TfIdfCalculator(new EdmOaiPmhJsonSchema());
assertEquals("uniqueness", calculator.getCalculatorName());
}

/**
 * With a default {@link SolrConfiguration} the calculator must report the
 * expected term-vector request URL template.
 */
@Test
public void getSolrSearchPath() throws Exception {
  String expectedPath =
    "http://localhost:8983/solr/europeana/tvrh/?q=id:\"%s\"&version=2.2&indent=on&qt=tvrh&tv=true&tv.all=true"
    + "&f.includes.tv.tf=true&tv.fl=dc_title_txt,dc_description_txt,dcterms_alternative_txt&wt=json&json.nl=map&rows=1000&fl=id";

  TfIdfCalculator tfIdf = new TfIdfCalculator(new EdmOaiPmhJsonSchema());
  tfIdf.setSolrConfiguration(new SolrConfiguration());

  assertEquals(expectedPath, tfIdf.getSolrSearchPath());
}

/** A freshly built calculator must report term collection as disabled. */
@Test
public void isTermCollectionEnabled() throws Exception {
  assertFalse(new TfIdfCalculator(new EdmOaiPmhJsonSchema()).isTermCollectionEnabled());
}

/** After {@code enableTermCollection(true)} the flag must read back as enabled. */
@Test
public void enableTermCollection() throws Exception {
  TfIdfCalculator tfIdf = new TfIdfCalculator(new EdmOaiPmhJsonSchema());

  tfIdf.enableTermCollection(true);

  assertTrue(tfIdf.isTermCollectionEnabled());
}

/** The no-argument constructor must be usable on its own. */
@Test
public void emptyContructor() throws Exception {
  assertNotNull(new TfIdfCalculator());
}

/**
 * Runs {@code measure} against a mocked Solr client and checks the per-field
 * sum and average values reported for the two index fields of the test schema.
 */
@Test
public void measure() throws Exception {
  SolrClient mockClient = new SolrClientMock(new SolrConfiguration("localhost", "8983", "solr"));
  Schema csvSchema = getSchema(Format.CSV);

  // Build a one-record CSV cache whose header comes from the schema itself.
  CsvPathCache cache = (CsvPathCache) PathCacheFactory.getInstance(csvSchema.getFormat(), "URL,two three");
  cache.setCsvReader(new CsvReader().setHeader(((CsvAwareSchema) csvSchema).getHeader()));

  // The record id is the value of the first instance found at the schema's record-id path.
  List<XmlFieldInstance> idInstances =
    (List<XmlFieldInstance>) cache.get(csvSchema.getRecordId().getJsonPath());
  cache.setRecordId(idInstances.get(0).getValue());

  TfIdfCalculator tfIdf = new TfIdfCalculator(csvSchema);
  tfIdf.setSolrClient(mockClient);

  List<MetricResult> results = tfIdf.measure(cache);
  assertNotNull(results);
  assertEquals(1, results.size());

  MetricResult metrics = results.get(0);
  assertEquals(0.008826999437345252, metrics.getResultMap().get("url:sum"));
  assertEquals(0.0017653998874690505, metrics.getResultMap().get("url:avg"));
  assertEquals(0.0, metrics.getResultMap().get("name:sum"));
  assertEquals(0.0, metrics.getResultMap().get("name:avg"));
}

/**
 * Builds a minimal two-field schema ("url" and "name", both extractable and
 * indexed under their own names) and uses the "url" field as the record id.
 *
 * @param format the format to set on the schema
 * @return the assembled schema
 */
private Schema getSchema(Format format) {
  var urlBranch = new JsonBranch("url").setExtractable().setIndexField("url");
  var nameBranch = new JsonBranch("name").setExtractable().setIndexField("name");

  BaseSchema built = new BaseSchema()
    .setFormat(format)
    .addField(urlBranch)
    .addField(nameBranch);
  built.setRecordId(built.getPathByLabel("url"));
  return built;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,19 @@ public String getSolrSearchResponse(String solrField, String value) {
return null;
}

/**
 * Returns a canned Solr term-vector response instead of querying a server.
 *
 * <p>Both arguments are ignored: the same JSON is returned for every call.
 * Its {@code termVectors} section holds a single entry keyed {@code "URL"}
 * with five terms under the {@code "url"} field, each carrying a
 * {@code tf-idf} value.
 *
 * @param params   unused by the mock
 * @param recordId unused by the mock
 * @return the fixed JSON response
 */
@Override
public String getTfIdfResponse(String params, String recordId) {
  return "{\"responseHeader\":{\"status\":0,\"QTime\":74}," +
    "\"response\":{\"numFound\":1,\"start\":0,\"docs\":[{\"id\":\"2022320/3F61C612ED9C42CCB85E533B4736795E8BDC7E77\"}]}," +
    "\"termVectors\":{\"warnings\":{\"noPayloads\":[\"dc_title_txt\",\"dc_description_txt\",\"dcterms_alternative_txt\"]}," +
    "\"URL\":{\"uniqueKey\":\"2022320/3F61C612ED9C42CCB85E533B4736795E8BDC7E77\"," +
    "\"url\":{" +
    "\"fleming\":{\"tf\":1,\"positions\":{\"position\":0},\"offsets\":{\"start\":0,\"end\":7},\"df\":1073,\"tf-idf\":9.319664492078285E-4}," +
    "\"huddersfield\":{\"tf\":1,\"positions\":{\"position\":4},\"offsets\":{\"start\":35,\"end\":47},\"df\":12073,\"tf-idf\":8.282945415389712E-5}," +
    "\"mair\":{\"tf\":1,\"positions\":{\"position\":1},\"offsets\":{\"start\":8,\"end\":12},\"df\":178,\"tf-idf\":0.0056179775280898875}," +
    "\"slaithwaite\":{\"tf\":1,\"positions\":{\"position\":3},\"offsets\":{\"start\":22,\"end\":33},\"df\":477,\"tf-idf\":0.0020964360587002098}," +
    "\"wedding\":{\"tf\":1,\"positions\":{\"position\":2},\"offsets\":{\"start\":13,\"end\":20},\"df\":10226,\"tf-idf\":9.778994719342852E-5}}}}}";
}

@Override
public void indexMap(String id, Map<String, List<String>> objectMap) throws IOException, SolrServerException {
this.id = id;
Expand Down

0 comments on commit b4f521a

Please sign in to comment.