Skip to content

Commit b4f521a

Browse files
committed
Testing TfIdfCalculator
1 parent fb97d33 commit b4f521a

File tree

6 files changed

+102
-1
lines changed

6 files changed

+102
-1
lines changed

src/main/java/de/gwdg/metadataqa/api/calculator/TfIdfCalculator.java

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import de.gwdg.metadataqa.api.model.pathcache.PathCache;
88
import de.gwdg.metadataqa.api.problemcatalog.FieldCounterBasedResult;
99
import de.gwdg.metadataqa.api.schema.Schema;
10+
import de.gwdg.metadataqa.api.uniqueness.SolrClient;
1011
import de.gwdg.metadataqa.api.uniqueness.SolrConfiguration;
1112
import de.gwdg.metadataqa.api.uniqueness.TfIdf;
1213
import de.gwdg.metadataqa.api.uniqueness.TfIdfExtractor;
@@ -60,6 +61,7 @@ public class TfIdfCalculator implements Calculator, Serializable {
6061
private Map<String, List<TfIdf>> termsCollection;
6162
private boolean termCollectionEnabled = false;
6263
private Schema schema;
64+
private SolrClient solrClient;
6365

6466
public TfIdfCalculator() {
6567
}
@@ -80,7 +82,9 @@ public List<MetricResult> measure(PathCache cache) {
8082
recordId = recordId.substring(1);
8183
}
8284

83-
String solrJsonResponse = getSolrResponse(recordId);
85+
String solrJsonResponse = solrClient != null
86+
? solrClient.getTfIdfResponse(String.format(SOLR_SEARCH_PARAMS, recordId).replace("\"", "%22"), recordId)
87+
: getSolrResponse(recordId);
8488
var extractor = new TfIdfExtractor(schema);
8589
FieldCounter<Double> resultMap = extractor.extract(solrJsonResponse, recordId, termCollectionEnabled);
8690
termsCollection = extractor.getTermsCollection();
@@ -153,4 +157,8 @@ public String getSolrSearchPath() {
153157
}
154158
return this.solrSearchPath;
155159
}
160+
161+
/**
 * Injects the {@link SolrClient} used to obtain the TF-IDF response.
 * When a client has been set (non-null), measure() calls
 * {@code solrClient.getTfIdfResponse(...)}; otherwise it falls back to
 * {@code getSolrResponse(recordId)}.
 *
 * @param solrClient the client to store in this calculator
 */
public void setSolrClient(SolrClient solrClient) {
162+
this.solrClient = solrClient;
163+
}
156164
}

src/main/java/de/gwdg/metadataqa/api/uniqueness/DefaultSolrClient.java

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,11 @@ public String getSolrSearchResponse(String solrField, String value) {
5353
return connect(url, solrField, value);
5454
}
5555

56+
/**
 * Requests the TF-IDF (term vector) response for one record.
 * The URL is {@code getSolrBasePath()} with {@code params} appended verbatim.
 * It is handed to {@code connect} with the fixed label "tf-idf" in the position
 * where getSolrSearchResponse passes the Solr field name, and {@code recordId}
 * in the position where it passes the value.
 *
 * @param params   request path/query string appended to the Solr base path.
 *                 NOTE(review): no escaping happens here; presumably the caller
 *                 has already escaped it -- confirm.
 * @param recordId identifier of the record, forwarded to {@code connect}
 * @return the value returned by {@code connect}
 */
@Override
57+
public String getTfIdfResponse(String params, String recordId) {
58+
return connect(getSolrBasePath() + params, "tf-idf", recordId);
59+
}
60+
5661
public String buildUrl(String solrField, String value) {
5762
String url;
5863
if (value.equals("*")) {

src/main/java/de/gwdg/metadataqa/api/uniqueness/SolrClient.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99

1010
public interface SolrClient extends Serializable {
1111
String getSolrSearchResponse(String solrField, String value);
12+
String getTfIdfResponse(String params, String recordId);
1213
void indexMap(String id, Map<String, List<String>> objectMap) throws IOException, SolrServerException;
1314
void commit();
1415
void deleteAll();

src/main/java/de/gwdg/metadataqa/api/uniqueness/TfIdfExtractor.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,7 @@ public FieldCounter<Double> extract(String jsonString, String recordId, boolean
6262
Object document = JSON_PROVIDER.parse(jsonString);
6363
var path = String.format("$.termVectors.['%s']", recordId);
6464
Map value = (LinkedHashMap) JsonPath.read(document, path);
65+
6566
for (JsonBranch jsonBranch : schema.getIndexFields()) {
6667
if (doCollectTerms) {
6768
termsCollection.put(jsonBranch.getJsonPath(), new ArrayList<>());

src/test/java/de/gwdg/metadataqa/api/calculator/TfIdfCalculatorTest.java

Lines changed: 73 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,23 @@
11
package de.gwdg.metadataqa.api.calculator;
22

3+
import de.gwdg.metadataqa.api.interfaces.MetricResult;
4+
import de.gwdg.metadataqa.api.json.JsonBranch;
5+
import de.gwdg.metadataqa.api.model.PathCacheFactory;
6+
import de.gwdg.metadataqa.api.model.XmlFieldInstance;
7+
import de.gwdg.metadataqa.api.model.pathcache.CsvPathCache;
8+
import de.gwdg.metadataqa.api.schema.BaseSchema;
9+
import de.gwdg.metadataqa.api.schema.CsvAwareSchema;
10+
import de.gwdg.metadataqa.api.schema.Format;
11+
import de.gwdg.metadataqa.api.schema.Schema;
312
import de.gwdg.metadataqa.api.schema.edm.EdmFullBeanSchema;
413
import de.gwdg.metadataqa.api.schema.edm.EdmOaiPmhJsonSchema;
514
import java.util.Arrays;
615
import java.util.List;
16+
17+
import de.gwdg.metadataqa.api.uniqueness.SolrClient;
18+
import de.gwdg.metadataqa.api.uniqueness.SolrClientMock;
19+
import de.gwdg.metadataqa.api.uniqueness.SolrConfiguration;
20+
import de.gwdg.metadataqa.api.util.CsvReader;
721
import org.junit.Test;
822
import static org.junit.Assert.*;
923

@@ -34,4 +48,63 @@ public void getCalculatorName() throws Exception {
3448
TfIdfCalculator calculator = new TfIdfCalculator(new EdmOaiPmhJsonSchema());
3549
assertEquals("uniqueness", calculator.getCalculatorName());
3650
}
51+
52+
/**
 * Verifies the search path reported by the calculator after it is given a
 * default-constructed {@link SolrConfiguration}: the expected value is the
 * term-vector ("tvrh") URL template on localhost:8983 for the "europeana" core,
 * still containing the "%s" placeholder for the record id.
 */
@Test
53+
public void getSolrSearchPath() throws Exception {
54+
TfIdfCalculator calculator = new TfIdfCalculator(new EdmOaiPmhJsonSchema());
55+
calculator.setSolrConfiguration(new SolrConfiguration());
56+
57+
assertEquals(
58+
"http://localhost:8983/solr/europeana/tvrh/?q=id:\"%s\"&version=2.2&indent=on&qt=tvrh&tv=true&tv.all=true" +
59+
"&f.includes.tv.tf=true&tv.fl=dc_title_txt,dc_description_txt,dcterms_alternative_txt&wt=json&json.nl=map&rows=1000&fl=id",
60+
calculator.getSolrSearchPath());
61+
}
62+
63+
/**
 * Verifies that term collection is disabled on a freshly constructed calculator.
 */
@Test
64+
public void isTermCollectionEnabled() throws Exception {
65+
TfIdfCalculator calculator = new TfIdfCalculator(new EdmOaiPmhJsonSchema());
66+
assertFalse(calculator.isTermCollectionEnabled());
67+
}
68+
69+
/**
 * Verifies that {@code enableTermCollection(true)} is reflected by
 * {@code isTermCollectionEnabled()}.
 */
@Test
70+
public void enableTermCollection() throws Exception {
71+
TfIdfCalculator calculator = new TfIdfCalculator(new EdmOaiPmhJsonSchema());
72+
calculator.enableTermCollection(true);
73+
assertTrue(calculator.isTermCollectionEnabled());
74+
}
75+
76+
/**
 * Smoke test: the no-argument constructor yields an instance.
 * NOTE(review): the method name is spelled "emptyContructor" (missing "s");
 * left as committed.
 */
@Test
77+
public void emptyContructor() throws Exception {
78+
TfIdfCalculator calculator = new TfIdfCalculator();
79+
assertNotNull(calculator);
80+
}
81+
82+
/**
 * Exercises {@code TfIdfCalculator.measure} with a {@link SolrClientMock}
 * injected through {@code setSolrClient}, so the TF-IDF response comes from the
 * mock's canned JSON rather than from the calculator's own Solr request.
 *
 * The input is the single CSV record "URL,two three" read with the header of the
 * two-field schema built by {@code getSchema}; the record id is taken from the
 * schema's record-id field ("url"), presumably giving "URL", which is the key
 * the mock response uses under "termVectors".
 *
 * Expected values: "url:sum" matches the sum of the five "tf-idf" values listed
 * under the mock's "url" entry, and "url:avg" is consistent with that sum divided
 * by five. The mock contains no "name" entry, and 0.0 is expected for both
 * "name" metrics.
 *
 * NOTE(review): the double assertions pass no delta argument, so they presumably
 * require exact equality -- confirm this is intended.
 */
@Test
83+
public void measure() throws Exception {
84+
SolrConfiguration solrConfiguration = new SolrConfiguration("localhost", "8983", "solr");
85+
Schema schema = getSchema(Format.CSV);
86+
SolrClient solrClient = new SolrClientMock(solrConfiguration);
87+
CsvPathCache cache = (CsvPathCache) PathCacheFactory.getInstance(schema.getFormat(), "URL,two three");
88+
cache.setCsvReader(new CsvReader().setHeader( ((CsvAwareSchema) schema).getHeader() ));
89+
// NOTE(review): unchecked cast of cache.get(...) to List<XmlFieldInstance>; the first value becomes the record id.
cache.setRecordId(((List<XmlFieldInstance>)cache.get(schema.getRecordId().getJsonPath())).get(0).getValue());
90+
91+
TfIdfCalculator calculator = new TfIdfCalculator(schema);
92+
calculator.setSolrClient(solrClient);
93+
List<MetricResult> result = calculator.measure(cache);
94+
assertNotNull(result);
95+
assertEquals(1, result.size());
96+
assertEquals(0.008826999437345252, result.get(0).getResultMap().get("url:sum"));
97+
assertEquals(0.0017653998874690505, result.get(0).getResultMap().get("url:avg"));
98+
assertEquals(0.0, result.get(0).getResultMap().get("name:sum"));
99+
assertEquals(0.0, result.get(0).getResultMap().get("name:avg"));
100+
}
101+
102+
/**
 * Builds the minimal schema used by the measure() test: two fields, "url" and
 * "name", each marked extractable and given an index field of the same name.
 * The "url" field is registered as the record id.
 *
 * @param format the format to set on the schema
 * @return the configured schema
 */
private Schema getSchema(Format format) {
103+
BaseSchema schema = new BaseSchema()
104+
.setFormat(format)
105+
.addField(new JsonBranch("url").setExtractable().setIndexField("url"))
106+
.addField(new JsonBranch("name").setExtractable().setIndexField("name"));
107+
schema.setRecordId(schema.getPathByLabel("url"));
108+
return schema;
109+
}
37110
}

src/test/java/de/gwdg/metadataqa/api/uniqueness/SolrClientMock.java

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,19 @@ public String getSolrSearchResponse(String solrField, String value) {
4141
return null;
4242
}
4343

44+
/**
 * Returns a fixed, hard-coded Solr term-vector JSON response; both arguments
 * are ignored.
 *
 * The canned response holds one record under "termVectors", keyed "URL", with a
 * single field "url" containing five terms (fleming, huddersfield, mair,
 * slaithwaite, wedding), each with tf, positions, offsets, df and tf-idf values.
 *
 * NOTE(review): unlike the neighbouring indexMap method, this one carries no
 * {@code @Override} annotation although SolrClient declares getTfIdfResponse --
 * consider adding it.
 *
 * @param params   unused
 * @param recordId unused
 * @return the canned JSON string
 */
public String getTfIdfResponse(String params, String recordId) {
45+
return "{\"responseHeader\":{\"status\":0,\"QTime\":74}," +
46+
"\"response\":{\"numFound\":1,\"start\":0,\"docs\":[{\"id\":\"2022320/3F61C612ED9C42CCB85E533B4736795E8BDC7E77\"}]}," +
47+
"\"termVectors\":{\"warnings\":{\"noPayloads\":[\"dc_title_txt\",\"dc_description_txt\",\"dcterms_alternative_txt\"]}," +
48+
"\"URL\":{\"uniqueKey\":\"2022320/3F61C612ED9C42CCB85E533B4736795E8BDC7E77\"," +
49+
"\"url\":{" +
50+
"\"fleming\":{\"tf\":1,\"positions\":{\"position\":0},\"offsets\":{\"start\":0,\"end\":7},\"df\":1073,\"tf-idf\":9.319664492078285E-4}," +
51+
"\"huddersfield\":{\"tf\":1,\"positions\":{\"position\":4},\"offsets\":{\"start\":35,\"end\":47},\"df\":12073,\"tf-idf\":8.282945415389712E-5}," +
52+
"\"mair\":{\"tf\":1,\"positions\":{\"position\":1},\"offsets\":{\"start\":8,\"end\":12},\"df\":178,\"tf-idf\":0.0056179775280898875}," +
53+
"\"slaithwaite\":{\"tf\":1,\"positions\":{\"position\":3},\"offsets\":{\"start\":22,\"end\":33},\"df\":477,\"tf-idf\":0.0020964360587002098}," +
54+
"\"wedding\":{\"tf\":1,\"positions\":{\"position\":2},\"offsets\":{\"start\":13,\"end\":20},\"df\":10226,\"tf-idf\":9.778994719342852E-5}}}}}";
55+
}
56+
4457
@Override
4558
public void indexMap(String id, Map<String, List<String>> objectMap) throws IOException, SolrServerException {
4659
this.id = id;

0 commit comments

Comments
 (0)