Skip to content

Commit e840662

Browse files
committed
Add validLink
1 parent bc224f3 commit e840662

File tree

9 files changed

+396
-15
lines changed

9 files changed

+396
-15
lines changed

README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ digital collections.
3737
* [`not [<rule1>, ..., <ruleN>]`](#not-rule1--rulen)
3838
- [Other constraints](#other-constraints)
3939
* [`contentType [type1, ..., typeN]`](#contenttype-type1--typen)
40+
* [`validLink <boolean>`](#validlink-boolean)
4041
* [`unique <boolean>`](#unique-boolean)
4142
* [`dependencies [id1, id2, ..., idN]`](#dependencies-id1-id2--idn)
4243
* [`dimension [criteria1, criteria2, ..., criteriaN]`](#dimension-criteria1-criteria2--criterian)
@@ -837,6 +838,38 @@ image/tiff-fx, image/gif, or image/svg+xml.
837838
- contentType: [image/jpeg, image/png, image/tiff, image/tiff-fx, image/gif, image/svg+xml]
838839
```
839840

841+
##### `validLink <boolean>`
842+
843+
(since v0.9.8)
844+
845+
This rule interprets the value as a URL, requests it, and checks whether it returns a valid HTTP response.
846+
847+
Example: The URL in the thumbnail field should be a valid link, i.e. requesting it
848+
should return a successful HTTP response.
849+
850+
```yaml
851+
- name: thumbnail
852+
path: oai:record/dc:identifier[@type='binary']
853+
rules:
854+
- validLink: true
855+
```
856+
857+
You can also add a timeout parameter in milliseconds (if you do not set it, the default value is
858+
5000 ms, i.e., 5 seconds). If the request does not retrieve results within this time limit, it
859+
breaks the connection and the check will return failure. Sometimes the response time is too
860+
long, and you would like to check several thousands of URLs, which otherwise would take a very
861+
long time.
862+
863+
Set timeout for 1 second:
864+
865+
```yaml
866+
- name: thumbnail
867+
path: oai:record/dc:identifier[@type='binary']
868+
rules:
869+
- validLink: true
870+
timeout: 1000
871+
```
872+
840873
##### `unique <boolean>`
841874

842875
(since v0.9.0)

src/main/java/de/gwdg/metadataqa/api/configuration/schema/Rule.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import com.fasterxml.jackson.annotation.JsonGetter;
44
import com.fasterxml.jackson.annotation.JsonIgnore;
55
import com.fasterxml.jackson.annotation.JsonInclude;
6+
import org.apache.commons.lang3.StringUtils;
67

78
import java.io.Serializable;
89
import java.lang.reflect.Field;
@@ -58,6 +59,8 @@ public class Rule implements Serializable {
5859
private MQAFPattern mqafPattern;
5960
private Boolean alwaysCheckDependencies = Boolean.FALSE;
6061
private String valuePath;
62+
// Boxed on purpose: an unset timeout must stay null so that getTimeout() can report
// "not configured" and the checkers fall back to their own default instead of using 0.
private Integer timeout;
63+
private Boolean validLink;
6164

6265
public String getId() {
6366
return id;
@@ -629,6 +632,19 @@ public Rule withValuePath(String valuePath) {
629632
return this;
630633
}
631634

635+
public Integer getTimeout() {
636+
return timeout;
637+
}
638+
639+
public void setTimeout(int timeout) {
640+
this.timeout = timeout;
641+
}
642+
643+
public Rule withTimeout(int timeout) {
644+
this.timeout = timeout;
645+
return this;
646+
}
647+
632648
@JsonGetter("alwaysCheckDependencies")
633649
public Boolean getAlwaysCheckDependencies() {
634650
return alwaysCheckDependencies;
@@ -643,6 +659,19 @@ public Rule withAlwaysCheckDependencies(Boolean alwaysCheckDependencies) {
643659
return this;
644660
}
645661

662+
public Boolean getValidLink() {
663+
return validLink;
664+
}
665+
666+
public void setValidLink(Boolean validLink) {
667+
this.validLink = validLink;
668+
}
669+
670+
public Rule withValidLink(Boolean validLink) {
671+
this.validLink = validLink;
672+
return this;
673+
}
674+
646675
@JsonIgnore
647676
public List<String> getRulenames() {
648677
List<String> excludeFromComparision = List.of("serialVersionUID", "id", "description",

src/main/java/de/gwdg/metadataqa/api/rule/singlefieldchecker/ContentTypeChecker.java

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,17 @@ public class ContentTypeChecker extends SingleFieldChecker {
1818
private static final Logger LOGGER = Logger.getLogger(ContentTypeChecker.class.getCanonicalName());
1919

2020
public static final String PREFIX = "contentType";
21-
protected List<String> fixedValues;
21+
protected List<String> expectedContentTypes;
22+
private ContentTypeExtractor contentTypeExtractor;
2223

23-
public ContentTypeChecker(DataElement field, List<String> contentType) {
24-
this(field, field.getLabel(), contentType);
24+
public ContentTypeChecker(DataElement field, List<String> expectedContentTypes) {
25+
this(field, field.getLabel(), expectedContentTypes, ContentTypeExtractor.DEFAULT_TIMEOUT);
2526
}
2627

27-
public ContentTypeChecker(DataElement field, String header, List<String> fixedValues) {
28+
public ContentTypeChecker(DataElement field, String header, List<String> expectedContentTypes, int timeout) {
2829
super(field, header + ":" + PREFIX);
29-
this.fixedValues = fixedValues;
30+
this.expectedContentTypes = expectedContentTypes;
31+
contentTypeExtractor = new ContentTypeExtractor(timeout);
3032
}
3133

3234
@Override
@@ -46,10 +48,10 @@ public void update(Selector selector, FieldCounter<RuleCheckerOutput> results, R
4648
instanceCount++;
4749
isNA = false;
4850
try {
49-
String contentType = ContentTypeExtractor.getContentType(instance.getValue());
51+
String contentType = contentTypeExtractor.getContentType(instance.getValue());
5052
if (isDebug())
5153
LOGGER.info(String.format("value: '%s' -> '%s'", instance.getValue(), contentType));
52-
if (contentType == null || !fixedValues.contains(contentType)) {
54+
if (contentType == null || !expectedContentTypes.contains(contentType)) {
5355
allPassed = false;
5456
if (countInstances())
5557
failureCount++;
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
package de.gwdg.metadataqa.api.rule.singlefieldchecker;
2+
3+
import de.gwdg.metadataqa.api.counter.FieldCounter;
4+
import de.gwdg.metadataqa.api.json.DataElement;
5+
import de.gwdg.metadataqa.api.model.XmlFieldInstance;
6+
import de.gwdg.metadataqa.api.model.selector.Selector;
7+
import de.gwdg.metadataqa.api.rule.RuleCheckerOutput;
8+
import de.gwdg.metadataqa.api.rule.RuleCheckingOutputStatus;
9+
import de.gwdg.metadataqa.api.rule.RuleCheckingOutputType;
10+
import de.gwdg.metadataqa.api.util.LinkValidator;
11+
12+
import java.io.IOException;
13+
import java.util.List;
14+
import java.util.logging.Logger;
15+
16+
public class LinkValidityChecker extends SingleFieldChecker {
17+
18+
private static final Logger LOGGER = Logger.getLogger(LinkValidityChecker.class.getCanonicalName());
19+
20+
public static final String PREFIX = "validLink";
21+
protected LinkValidator linkValidator;
22+
protected Boolean expectedValue;
23+
24+
public LinkValidityChecker(DataElement field, Boolean expectedValue) {
25+
this(field, field.getLabel(), expectedValue, LinkValidator.DEFAULT_TIMEOUT);
26+
}
27+
28+
public LinkValidityChecker(DataElement field, String header, Boolean expectedValue, int timeout) {
29+
super(field, header + ":" + PREFIX);
30+
this.expectedValue = expectedValue;
31+
linkValidator = new LinkValidator(timeout);
32+
}
33+
34+
@Override
35+
public void update(Selector selector, FieldCounter<RuleCheckerOutput> results, RuleCheckingOutputType outputType) {
36+
if (isDebug())
37+
LOGGER.info(this.getClass().getSimpleName() + " " + this.id);
38+
39+
var allPassed = true;
40+
var isNA = true;
41+
int instanceCount = 0;
42+
int failureCount = 0;
43+
List<XmlFieldInstance> instances = selector.get(field);
44+
if (instances != null && !instances.isEmpty()) {
45+
for (XmlFieldInstance instance : instances) {
46+
if (instance.hasValue()) {
47+
if (countInstances())
48+
instanceCount++;
49+
isNA = false;
50+
try {
51+
boolean isValid = linkValidator.isValid(instance.getValue());
52+
if (isDebug())
53+
LOGGER.info(String.format("value: '%s' -> '%s'", instance.getValue(), isValid));
54+
if (isValid != expectedValue) {
55+
allPassed = false;
56+
if (countInstances())
57+
failureCount++;
58+
}
59+
} catch (IOException e) {
60+
LOGGER.warning(String.format("%s: %s", e.getClass().getSimpleName(), e.getMessage()));
61+
allPassed = false;
62+
if (countInstances())
63+
failureCount++;
64+
}
65+
if (!countInstances() && !allPassed)
66+
break;
67+
}
68+
}
69+
}
70+
71+
addOutput(results, isNA, allPassed, outputType, instanceCount, failureCount);
72+
if (isDebug())
73+
LOGGER.info(this.getClass().getSimpleName() + " " + this.id + ") result: " + RuleCheckingOutputStatus.create(isNA, allPassed, isMandatory()));
74+
}
75+
}

src/main/java/de/gwdg/metadataqa/api/schema/SchemaUtils.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import de.gwdg.metadataqa.api.rule.singlefieldchecker.HasValueChecker;
1717
import de.gwdg.metadataqa.api.rule.singlefieldchecker.ImageDimensionChecker;
1818
import de.gwdg.metadataqa.api.rule.singlefieldchecker.LanguageTagChecker;
19+
import de.gwdg.metadataqa.api.rule.singlefieldchecker.LinkValidityChecker;
1920
import de.gwdg.metadataqa.api.rule.singlefieldchecker.MQAFPatternChecker;
2021
import de.gwdg.metadataqa.api.rule.singlefieldchecker.MaxCountChecker;
2122
import de.gwdg.metadataqa.api.rule.singlefieldchecker.MaxLengthChecker;
@@ -126,8 +127,19 @@ private static List<RuleChecker> processRule(Schema schema, DataElement dataElem
126127
if (rule.getMaxExclusive() != null)
127128
ruleCheckers.add(new NumericValueChecker(dataElement, rule.getMinInclusive(), MAX_EXCLUSIVE));
128129

129-
if (rule.getContentType() != null && !rule.getContentType().isEmpty())
130-
ruleCheckers.add(new ContentTypeChecker(dataElement, rule.getContentType()));
130+
if (rule.getContentType() != null && !rule.getContentType().isEmpty()) {
131+
ContentTypeChecker contentTypeChecker = rule.getTimeout() == null
132+
? new ContentTypeChecker(dataElement, rule.getContentType())
133+
: new ContentTypeChecker(dataElement, dataElement.getLabel(), rule.getContentType(), rule.getTimeout());
134+
ruleCheckers.add(contentTypeChecker);
135+
}
136+
137+
if (rule.getValidLink() != null) {
138+
LinkValidityChecker checker = rule.getTimeout() == null
139+
? new LinkValidityChecker(dataElement, rule.getValidLink())
140+
: new LinkValidityChecker(dataElement, dataElement.getLabel(), rule.getValidLink(), rule.getTimeout());
141+
ruleCheckers.add(checker);
142+
}
131143

132144
if (rule.getDimension() != null)
133145
ruleCheckers.add(new ImageDimensionChecker(dataElement, rule.getDimension()));

src/main/java/de/gwdg/metadataqa/api/util/ContentTypeExtractor.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,18 @@
99

1010
public class ContentTypeExtractor {
1111
private static final Logger LOGGER = Logger.getLogger(ContentTypeExtractor.class.getCanonicalName());
12-
private static int timeout = 5000;
12+
public static final int DEFAULT_TIMEOUT = 5000;
13+
private int timeout;
1314

14-
public static String getContentType(String url) throws IOException {
15+
public ContentTypeExtractor() {
16+
this.timeout = DEFAULT_TIMEOUT;
17+
}
18+
19+
public ContentTypeExtractor(int timeout) {
20+
this.timeout = timeout;
21+
}
22+
23+
public String getContentType(String url) throws IOException {
1524
String contentType = null;
1625
URL urlObj = new URL(url);
1726
HttpURLConnection urlConnection = (HttpURLConnection) urlObj.openConnection();
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package de.gwdg.metadataqa.api.util;
2+
3+
import java.io.IOException;
4+
import java.net.HttpURLConnection;
5+
import java.net.URL;
6+
import java.util.logging.Logger;
7+
8+
/**
 * Checks whether a URL can be retrieved over HTTP(S).
 *
 * <p>A link counts as valid when the server answers with status 200, either directly or after
 * a bounded number of redirects. Connecting and reading are both limited by the configured
 * timeout (in milliseconds).
 */
public class LinkValidator {
  private static final Logger LOGGER = Logger.getLogger(LinkValidator.class.getCanonicalName());
  public static final int DEFAULT_TIMEOUT = 5000;

  /** Maximum number of redirects followed for one link; protects against redirect loops. */
  private static final int MAX_REDIRECTS = 10;
  /** Redirect status codes for which {@link HttpURLConnection} defines no constants. */
  private static final int HTTP_TEMPORARY_REDIRECT = 307;
  private static final int HTTP_PERMANENT_REDIRECT = 308;

  private final int timeout;

  /** Creates a validator using {@link #DEFAULT_TIMEOUT}. */
  public LinkValidator() {
    this(DEFAULT_TIMEOUT);
  }

  /**
   * Creates a validator with an explicit timeout.
   *
   * @param timeout connect and read timeout in milliseconds
   */
  public LinkValidator(int timeout) {
    this.timeout = timeout;
  }

  /**
   * Requests the URL and reports whether it finally resolves to an HTTP 200 response.
   *
   * @param url the link to check
   * @return {@code true} if the (possibly redirected) request ends with status 200;
   *         {@code false} for any other status, for non-HTTP(S) URLs, for redirects without a
   *         {@code Location} header, and when more than {@value #MAX_REDIRECTS} redirects occur
   * @throws IOException if the URL is malformed or the connection fails or times out
   */
  public boolean isValid(String url) throws IOException {
    URL current = new URL(url);
    for (int hop = 0; hop <= MAX_REDIRECTS; hop++) {
      java.net.URLConnection connection = current.openConnection();
      // file:, ftp:, jar: ... URLs do not yield an HttpURLConnection; a blind cast would throw
      if (!(connection instanceof HttpURLConnection)) {
        LOGGER.warning(String.format("URL %s is not an HTTP(S) URL.", current));
        return false;
      }

      HttpURLConnection urlConnection = (HttpURLConnection) connection;
      try {
        urlConnection.setConnectTimeout(timeout);
        urlConnection.setReadTimeout(timeout);
        urlConnection.connect();
        int responseCode = urlConnection.getResponseCode();

        if (responseCode == HttpURLConnection.HTTP_OK) {
          return true;
        }
        if (!isRedirect(responseCode)) {
          LOGGER.warning(String.format("URL %s returns unhandled status code: %d.\n", current, responseCode));
          return false;
        }

        String location = urlConnection.getHeaderField("Location");
        if (location == null || location.isEmpty()) {
          LOGGER.warning(String.format("URL %s redirects (status %d) without a Location header.", current, responseCode));
          return false;
        }
        // resolve against the current URL so that relative Location values work as well
        current = new URL(current, location);
      } finally {
        urlConnection.disconnect();
      }
    }

    LOGGER.warning(String.format("URL %s was redirected more than %d times.", url, MAX_REDIRECTS));
    return false;
  }

  /** Tells whether the status code is a redirect that carries the target in a Location header. */
  private static boolean isRedirect(int responseCode) {
    return responseCode == HttpURLConnection.HTTP_MOVED_PERM
      || responseCode == HttpURLConnection.HTTP_MOVED_TEMP
      || responseCode == HttpURLConnection.HTTP_SEE_OTHER
      || responseCode == HTTP_TEMPORARY_REDIRECT
      || responseCode == HTTP_PERMANENT_REDIRECT;
  }

}

src/test/java/de/gwdg/metadataqa/api/rule/singlefieldchecker/ContentTypeCheckerTest.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public void prefix() {
3939
@Test
4040
public void success() {
4141
cache = (CsvSelector) SelectorFactory.getInstance(schema.getFormat(),
42-
"\"https://iiif.deutsche-digitale-bibliothek.de/image/2/ec863e48-7e20-4e9c-95fd-babd708b6eaf/full/full/0/default.jpg\"");
42+
"\"https://iiif.deutsche-digitale-bibliothek.de/image/2/d3/a3/d3a3a84c-1fb5-4390-9bbf-e14cd8b746f8/full/full/0/default.jpg\"");
4343
cache.setCsvReader(new CsvReader().setHeader( ((CsvAwareSchema) schema).getHeader() ));
4444

4545
ContentTypeChecker checker = new ContentTypeChecker(
@@ -53,13 +53,10 @@ public void success() {
5353

5454
assertEquals(2, fieldCounter.size());
5555
assertEquals("name:contentType", checker.getHeaderWithoutId());
56-
// TODO
57-
/*
5856
Assert.assertEquals(RuleCheckingOutputStatus.PASSED, fieldCounter.get(checker.getHeader(RuleCheckingOutputType.STATUS)).getStatus());
5957
Assert.assertNull(fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getScore());
6058
Assert.assertEquals(0, (int) fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getInstanceCount());
6159
Assert.assertEquals(0, (int) fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getFailureCount());
62-
*/
6360
}
6461

6562
@Test

0 commit comments

Comments
 (0)