Skip to content

Commit 3b52dd6

Browse files
committed
First commit
0 parents  commit 3b52dd6

File tree

8 files changed

+139
-0
lines changed

8 files changed

+139
-0
lines changed

.classpath

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<classpath>
3+
<classpathentry kind="src" path="src"/>
4+
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
5+
<classpathentry kind="lib" path="lib/jsoup-1.8.3.jar"/>
6+
<classpathentry kind="lib" path="lib/java-json.jar"/>
7+
<classpathentry kind="output" path="bin"/>
8+
</classpath>

.gitignore

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
.metadata
2+
bin/
3+
tmp/
4+
*.tmp
5+
*.bak
6+
*.swp
7+
*~.nib
8+
local.properties
9+
.settings/
10+
.loadpath
11+
.recommenders
12+
13+
# Eclipse Core
14+
# .project
15+
16+
# External tool builders
17+
.externalToolBuilders/
18+
19+
# Locally stored "Eclipse launch configurations"
20+
*.launch
21+
22+
# PyDev specific (Python IDE for Eclipse)
23+
*.pydevproject
24+
25+
# CDT-specific (C/C++ Development Tooling)
26+
.cproject
27+
28+
# JDT-specific (Eclipse Java Development Tools)
29+
# .classpath
30+
31+
# Java annotation processor (APT)
32+
.factorypath
33+
34+
# PDT-specific (PHP Development Tools)
35+
.buildpath
36+
37+
# sbteclipse plugin
38+
.target
39+
40+
# Tern plugin
41+
.tern-project
42+
43+
# TeXlipse plugin
44+
.texlipse
45+
46+
# STS (Spring Tool Suite)
47+
.springBeans
48+
49+
# Code Recommenders
50+
.recommenders/

.project

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<projectDescription>
3+
<name>crawler_ted_articles</name>
4+
<comment></comment>
5+
<projects>
6+
</projects>
7+
<buildSpec>
8+
<buildCommand>
9+
<name>org.eclipse.jdt.core.javabuilder</name>
10+
<arguments>
11+
</arguments>
12+
</buildCommand>
13+
</buildSpec>
14+
<natures>
15+
<nature>org.eclipse.jdt.core.javanature</nature>
16+
</natures>
17+
</projectDescription>

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Crawler for TED articles

corpus/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This directory will contain the downloaded article files.

lib/java-json.jar

82.7 KB
Binary file not shown.

lib/jsoup-1.8.3.jar

308 KB
Binary file not shown.
+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
package com.nadjib.crawler.ted;
2+
import java.io.FileWriter;
3+
import java.util.ArrayList;
4+
5+
import org.json.JSONObject;
6+
import org.jsoup.Jsoup;
7+
import org.jsoup.nodes.Document;
8+
import org.jsoup.select.Elements;
9+
10+
// https://github.com/jhy/jsoup
11+
// http://jsoup.org/
12+
// http://www.java2s.com/Code/Jar/j/Downloadjavajsonjar.htm
13+
14+
public class ListURLs {
15+
16+
public static final String ROOT_URL = "https://www.ted.com";
17+
18+
public static final int NB_PAGES = 5;
19+
// TODO Add all list of languages
20+
public static final String LANG = "AR";
21+
22+
public static void main(String[] args) {
23+
try {
24+
System.out.println("Begin");
25+
26+
ArrayList<String> urlsArray = new ArrayList<String>();
27+
28+
for (int i = 1; i <= NB_PAGES; i++) {
29+
30+
System.out.println("Download page N° " + i + " ...");
31+
32+
Document doc = Jsoup.connect(ROOT_URL + "/talks?language=" + LANG + "&page=" + i).get();
33+
Elements links = doc.select("[href~=^(?i)/talks/.*=" + LANG + "]");
34+
35+
for (int j = 1; j < links.size(); j++) {
36+
String link = links.get(j).attr("href");
37+
if (link != null) {
38+
link = link.replaceFirst("[\\?|&].*", "");
39+
40+
if (!urlsArray.contains(link))
41+
urlsArray.add(link);
42+
}
43+
}
44+
}
45+
46+
JSONObject jsonObject = new JSONObject();
47+
jsonObject.put("root_url", ROOT_URL);
48+
49+
jsonObject.put("child_urls", urlsArray);
50+
51+
FileWriter file = new FileWriter("corpus/_urls.json");
52+
file.write(jsonObject.toString(4));
53+
54+
file.close();
55+
56+
System.out.println("Finish.");
57+
58+
} catch (Exception e) {
59+
e.printStackTrace();
60+
}
61+
}
62+
}

0 commit comments

Comments
 (0)