Skip to content

Commit 267e7b6

Browse files
committed
Improve link validation
1 parent c9646f9 commit 267e7b6

File tree

8 files changed

+71
-22
lines changed

8 files changed

+71
-22
lines changed

pom.xml

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
<groupId>de.gwdg.metadataqa</groupId>
66
<artifactId>metadata-qa-api</artifactId>
77
<packaging>jar</packaging>
8-
<version>0.9.8</version>
8+
<version>0.9.9-SNAPSHOT</version>
99
<name>Metadata Quality Assurance Framework API</name>
1010
<description>
11-
A metadata quality assurance framework. It checks some metrics of
11+
A metadata quality assurance framework. It checks some metrics of
1212
metadata records, such as completeness, uniqueness, problem catalog.
1313
</description>
1414

@@ -41,15 +41,39 @@
4141
</scm>
4242

4343
<distributionManagement>
44+
<repository>
45+
<id>central</id>
46+
<url>https://central.sonatype.com</url>
47+
<snapshots>
48+
<enabled>false</enabled>
49+
</snapshots>
50+
</repository>
4451
<snapshotRepository>
45-
<id>ossrh</id>
46-
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
52+
<id>central-snapshot</id>
53+
<url>https://central.sonatype.com/repository/maven-snapshots</url>
54+
<snapshots>
55+
<enabled>true</enabled>
56+
<updatePolicy>always</updatePolicy>
57+
</snapshots>
58+
<releases>
59+
<enabled>false</enabled>
60+
</releases>
4761
</snapshotRepository>
62+
</distributionManagement>
63+
64+
<repositories>
4865
<repository>
49-
<id>ossrh</id>
50-
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
66+
<name>Central Portal Snapshots</name>
67+
<id>central-portal-snapshots</id>
68+
<url>https://central.sonatype.com/repository/maven-snapshots/</url>
69+
<releases>
70+
<enabled>false</enabled>
71+
</releases>
72+
<snapshots>
73+
<enabled>true</enabled>
74+
</snapshots>
5175
</repository>
52-
</distributionManagement>
76+
</repositories>
5377

5478
<properties>
5579
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

src/main/java/de/gwdg/metadataqa/api/rule/singlefieldchecker/LinkValidityChecker.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ public class LinkValidityChecker extends SingleFieldChecker {
2121
protected LinkValidator linkValidator;
2222
protected Boolean expectedValue;
2323

24+
/**
25+
* @param field The data element to check
26+
* @param expectedValue Is the link expected to be valid?
27+
*/
2428
public LinkValidityChecker(DataElement field, Boolean expectedValue) {
2529
this(field, field.getLabel(), expectedValue, LinkValidator.DEFAULT_TIMEOUT);
2630
}

src/main/java/de/gwdg/metadataqa/api/util/ContentTypeExtractor.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,13 @@ public String getContentType(String url) throws IOException {
3333
String rawContentType = urlConnection.getHeaderField("Content-Type");
3434
if (rawContentType != null && StringUtils.isNotBlank(rawContentType))
3535
contentType = rawContentType.replaceAll("; ?charset.*$", "");
36-
} else if (responseCode == 301 || responseCode == 302 || responseCode == 303) {
36+
} else if (responseCode == 301 // Moved Permanently
37+
|| responseCode == 302 // Found
38+
|| responseCode == 303 // See Other
39+
|| responseCode == 304 // Not Modified
40+
|| responseCode == 307 // Temporary Redirect
41+
|| responseCode == 308 // Permanent Redirect
42+
) {
3743
String location = urlConnection.getHeaderField("Location");
3844
return getContentType(location);
3945
} else {

src/main/java/de/gwdg/metadataqa/api/util/LinkValidator.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,21 @@ public boolean isValid(String url) throws IOException {
2828
int responseCode = urlConnection.getResponseCode();
2929
if (responseCode == 200) {
3030
return true;
31-
} else if (responseCode == 301 || responseCode == 302 || responseCode == 303) {
31+
} else if (responseCode == 301 // Moved Permanently
32+
|| responseCode == 302 // Found
33+
|| responseCode == 303 // See Other
34+
|| responseCode == 304 // Not Modified
35+
|| responseCode == 307 // Temporary Redirect
36+
|| responseCode == 308 // Permanent Redirect
37+
) {
3238
String location = urlConnection.getHeaderField("Location");
3339
return isValid(location);
40+
} else if (responseCode == 401 // Unauthorized
41+
|| responseCode == 402 // Payment Required
42+
|| responseCode == 403 // Forbidden
43+
|| responseCode == 407 // Proxy Authentication Required
44+
) {
45+
return true;
3446
} else {
3547
LOGGER.warning(String.format("URL %s returns unhandled status code: %d.\n", url, responseCode));
3648
}

src/test/java/de/gwdg/metadataqa/api/cli/VersionTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
public class VersionTest {
88

9-
private static final String EXPECTED_VERSION = "0.9.8";
9+
private static final String EXPECTED_VERSION = "0.9.9-SNAPSHOT";
1010

1111
@Test
1212
public void getVersion() {

src/test/java/de/gwdg/metadataqa/api/rule/singlefieldchecker/ContentTypeCheckerTest.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public void success() {
5353

5454
assertEquals(2, fieldCounter.size());
5555
assertEquals("name:contentType", checker.getHeaderWithoutId());
56-
Assert.assertEquals(RuleCheckingOutputStatus.PASSED, fieldCounter.get(checker.getHeader(RuleCheckingOutputType.STATUS)).getStatus());
56+
Assert.assertEquals(RuleCheckingOutputStatus.FAILED, fieldCounter.get(checker.getHeader(RuleCheckingOutputType.STATUS)).getStatus());
5757
Assert.assertNull(fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getScore());
5858
Assert.assertEquals(0, (int) fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getInstanceCount());
5959
Assert.assertEquals(0, (int) fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getFailureCount());
@@ -76,15 +76,18 @@ public void success_withCountInstance() {
7676

7777
assertEquals(2, fieldCounter.size());
7878
assertEquals("name:contentType", checker.getHeaderWithoutId());
79-
Assert.assertEquals(RuleCheckingOutputStatus.PASSED, fieldCounter.get(checker.getHeader(RuleCheckingOutputType.STATUS)).getStatus());
79+
Assert.assertEquals(RuleCheckingOutputStatus.FAILED, fieldCounter.get(checker.getHeader(RuleCheckingOutputType.STATUS)).getStatus());
8080
Assert.assertNull(fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getScore());
8181
Assert.assertEquals(1, (int) fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getInstanceCount());
82-
Assert.assertEquals(0, (int) fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getFailureCount());
82+
Assert.assertEquals(1, (int) fieldCounter.get(checker.getHeader(RuleCheckingOutputType.SCORE)).getFailureCount());
8383
}
8484

8585
@Test
8686
public void failure() {
87-
cache = (CsvSelector) SelectorFactory.getInstance(schema.getFormat(), "http://creativecommons.org/licenses/by-nc-sa/4.0/");
87+
cache = (CsvSelector) SelectorFactory.getInstance(schema.getFormat(),
88+
// "http://creativecommons.org/licenses/by-nc-sa/4.0/"
89+
"https://github.com/pkiraly/metadata-qa-api"
90+
);
8891
cache.setCsvReader(new CsvReader().setHeader( ((CsvAwareSchema) schema).getHeader() ));
8992

9093
ContentTypeChecker checker = new ContentTypeChecker(schema.getPathByLabel("name"),
@@ -162,7 +165,9 @@ public void unaccessible() {
162165
@Test
163166
public void t301() {
164167
cache = (CsvSelector) SelectorFactory.getInstance(schema.getFormat(),
165-
"http://creativecommons.org/licenses/by-nc-sa/4.0/");
168+
// "http://creativecommons.org/licenses/by-nc-sa/4.0/"
169+
"https://github.com/pkiraly/metadata-qa-api"
170+
);
166171
cache.setCsvReader(new CsvReader().setHeader( ((CsvAwareSchema) schema).getHeader() ));
167172

168173
ContentTypeChecker checker = new ContentTypeChecker(schema.getPathByLabel("name"),

src/test/java/de/gwdg/metadataqa/api/rule/singlefieldchecker/LinkValidityCheckerTest.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -152,14 +152,13 @@ public void unaccessible() {
152152
"http://vb.uni-wuerzburg.de/ub/books/36z1197_57156733/folio-std/unexisting.jpg");
153153
cache.setCsvReader(new CsvReader().setHeader( ((CsvAwareSchema) schema).getHeader() ));
154154

155-
ContentTypeChecker checker = new ContentTypeChecker(schema.getPathByLabel("name"),
156-
Arrays.asList("image/jpeg", "image/png", "image/tiff", "image/tiff-fx", "image/gif", "image/svg+xml", "application/pdf"));
155+
LinkValidityChecker checker = new LinkValidityChecker(schema.getPathByLabel("name"), true);
157156

158157
FieldCounter<RuleCheckerOutput> fieldCounter = new FieldCounter<>();
159158
checker.update(cache, fieldCounter, RuleCheckingOutputType.BOTH);
160159

161160
assertEquals(2, fieldCounter.size());
162-
assertEquals("name:contentType", checker.getHeaderWithoutId());
161+
assertEquals("name:validLink", checker.getHeaderWithoutId());
163162
Assert.assertEquals(RuleCheckingOutputStatus.FAILED, fieldCounter.get(checker.getHeader(RuleCheckingOutputType.STATUS)).getStatus());
164163
}
165164

@@ -169,15 +168,14 @@ public void t301() {
169168
"http://creativecommons.org/licenses/by-nc-sa/4.0/");
170169
cache.setCsvReader(new CsvReader().setHeader( ((CsvAwareSchema) schema).getHeader() ));
171170

172-
ContentTypeChecker checker = new ContentTypeChecker(schema.getPathByLabel("name"),
173-
Arrays.asList("text/html"));
171+
LinkValidityChecker checker = new LinkValidityChecker(schema.getPathByLabel("name"), true);
174172

175173
FieldCounter<RuleCheckerOutput> fieldCounter = new FieldCounter<>();
176174

177175
checker.update(cache, fieldCounter, RuleCheckingOutputType.BOTH);
178176

179177
assertEquals(2, fieldCounter.size());
180-
assertEquals("name:contentType", checker.getHeaderWithoutId());
178+
assertEquals("name:validLink", checker.getHeaderWithoutId());
181179
assertEquals(RuleCheckingOutputStatus.PASSED, fieldCounter.get(checker.getHeader(RuleCheckingOutputType.STATUS)).getStatus());
182180
}
183181

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
url
22
http://vb.uni-wuerzburg.de/ub/books/36z1197_57156733/folio-std/DE-20__36_Z_1_246__0001__197_0001.jpg
3-
http://creativecommons.org/licenses/by-nc-sa/4.0/
3+
https://github.com/pkiraly/metadata-qa-api
44
https://proxy.europeana.eu/media/2024903/photography_ProvidedCHO_KU_Leuven_9990740370101488/823c786b11d2acab6b7fb0477b354a27?disposition=inline&recordApiUrl=https%3A%2F%2Fapi.europeana.eu%2Frecord

0 commit comments

Comments
 (0)