Skip to content

Commit 601c640

Browse files
committed
[CNKI] Clean up data
1 parent e622c8d commit 601c640

File tree

1 file changed

+98
-25
lines changed

1 file changed

+98
-25
lines changed

CNKI.js

+98-25
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
"inRepository": true,
1010
"translatorType": 4,
1111
"browserSupport": "gcs",
12-
"lastUpdated": "2013-08-25 02:58:03"
12+
"lastUpdated": "2013-08-25 04:10:34"
1313
}
1414

1515
/*
@@ -43,7 +43,21 @@ function getRefworksByID(ids, next) {
4343
ZU.doPost(
4444
'http://epub.cnki.net/KNS/ViewPage/SaveSelectedNoteFormat.aspx?type=txt',
4545
'CurSaveModeType=REFWORKS',
46-
next
46+
function(text) {
47+
//fix item types
48+
text = text.replace(/^RT\s+Dissertation\/Thesis/gmi, 'RT Dissertation')
49+
//Zotero doesn't do well with mixed line endings. Make everything \n
50+
.replace(/\r\n?/g, '\n')
51+
//split authors
52+
.replace(/^(A[1-4]|U2)\s*([^\r\n]+)/gm, function(m, tag, authors) {
53+
var authors = authors.split(/\s*[;,]\s*/); //that's a special comma
54+
if(!authors[authors.length-1].trim()) authors.pop();
55+
56+
return tag + ' ' + authors.join('\n' + tag + ' ');
57+
});
58+
59+
next(text);
60+
}
4761
);
4862
});
4963
}
@@ -58,10 +72,16 @@ function getIDFromURL(url) {
5872
return {dbname: dbname[1], filename: filename[1], url: url};
5973
}
6074

75+
/**
 * Finds the CNKI record identifier for a page.
 *
 * The page URL is tried first. When getIDFromURL finds nothing in it, the
 * href of the link inside <div class="zwjdown"> is tried instead.
 *
 * @param {Document} doc Page that is searched for the fallback link
 * @param {String} url URL of the page
 * @return {Object|undefined} The value getIDFromURL produced for the first
 *   URL that yielded an ID; undefined when the page URL has no ID and the
 *   page contains no fallback link
 */
function getIDFromPage(doc, url) {
	var id = getIDFromURL(url);
	if(id) return id;

	// ZU.xpathText gives back no string when nothing matches the expression.
	// Do not hand that on to getIDFromURL, which expects a URL string.
	var href = ZU.xpathText(doc, '//div[@class="zwjdown"]/a/@href');
	if(!href) return;

	return getIDFromURL(href);
}
79+
6180
function getTypeFromDBName(dbname) {
6281
switch(dbname.substr(0,4).toUpperCase()) {
6382
case "CJFQ":
6483
case "CJFD":
84+
case "CAPJ":
6585
return "journalArticle";
6686
case "CDFD":
6787
case "CMFD":
@@ -76,27 +96,40 @@ function getTypeFromDBName(dbname) {
7696
}
7797
}
7898

79-
/**
 * Collects the selectable entries of a search results page.
 *
 * Result rows are looked up as innermost table rows containing an
 * a.fz14 title link; if there are none, rows of the "GridTableContent"
 * table with a link in their second cell are used instead.
 *
 * @param {Document} doc Search results page
 * @param {String} url URL of the page (not used here)
 * @param {Object} [itemInfo] When supplied, it is filled with
 *   {id: <result of getIDFromURL>} for every returned entry, keyed by the
 *   entry's link href
 * @return {Object|undefined} Map of link href to whitespace-normalized
 *   title, or undefined when no row yields both a title and an ID
 */
function getItemsFromSearchResults(doc, url, itemInfo) {
	var anchorPath = './/a[@class="fz14"]';
	var rows = ZU.xpath(doc, '//tr[not(.//tr) and .//a[@class="fz14"]]');
	if(rows.length === 0) {
		// alternative result table layout
		anchorPath = './td[2]/a';
		rows = ZU.xpath(doc, '//table[@class="GridTableContent"]/tbody/tr[./td[2]/a]');
	}
	if(rows.length === 0) return;

	var items = {};
	var foundAny = false;
	for(var r=0; r<rows.length; r++) {
		var anchor = ZU.xpath(rows[r], anchorPath)[0];

		// title text without the contents of embedded SCRIPT nodes
		var title = ZU.xpathText(anchor, './node()[not(name()="SCRIPT")]', null, '');
		if(title) title = ZU.trimInternal(title);

		var id = getIDFromURL(anchor.href);
		if(title && id) {
			foundAny = true;
			if(itemInfo) {
				itemInfo[anchor.href] = {id: id};

				// PDF link lookup is currently disabled:
				/*var pdfLink = ZU.xpath(rows[r], './/a[@class="brief_downloadIcon"]')[0];
				if(pdfLink) itemInfo[anchor.href].pdfURL = pdfLink.href;*/
			}
			items[anchor.href] = title;
		}
	}

	if(foundAny) return items;
}
97129

98130
function detectWeb(doc, url) {
99-
var id = getIDFromURL(url);
131+
var id = getIDFromPage(doc, url);
132+
Z.debug(id);
100133
if(id) {
101134
return getTypeFromDBName(id.dbname);
102135
}
@@ -107,41 +140,81 @@ function detectWeb(doc, url) {
107140

108141
/**
 * Translator entry point: saves the item(s) found on the page.
 *
 * On a "multiple" page the user picks entries through Z.selectItems and the
 * picked ones are passed to scrape together with a lookup table keyed by
 * title. On any other page the single ID from getIDFromPage is scraped.
 *
 * @param {Document} doc Page being translated
 * @param {String} url URL of the page
 */
function doWeb(doc, url) {
	if(detectWeb(doc, url) == "multiple") {
		var itemInfo = {};
		var items = getItemsFromSearchResults(doc, url, itemInfo);
		Z.selectItems(items, function(selectedItems) {
			if(!selectedItems) return true;

			// Keyed by title because scrape looks entries up by the title of
			// each imported record. Selected results that share a title
			// overwrite each other here.
			var itemInfoByTitle = {};
			var ids = [];
			// The loop variable must not be called "url": a var declared in
			// this callback would shadow the page URL passed to scrape below.
			for(var itemURL in selectedItems) {
				var info = itemInfo[itemURL];
				ids.push(info.id);
				info.url = itemURL;
				itemInfoByTitle[selectedItems[itemURL]] = info;
			}
			scrape(ids, doc, url, itemInfoByTitle);
		});
	} else {
		scrape([getIDFromPage(doc, url)], doc, url);
	}
}
124161

125-
function scrape(ids) {
162+
function scrape(ids, doc, url, itemInfo) {
126163
getRefworksByID(ids, function(text) {
127-
Z.debug(text);
128-
//fix item types
129-
text = text.replace(/RT\s+Dissertation\/Thesis/mi, 'RT Dissertation')
130-
//split authors
131-
.replace(/^(A[1-4]|U2)\s*([^\r\n]+)/m, function(m, tag, authors) {
132-
var authors = authors.split(';');
133-
if(!authors[authors.length-1].trim()) authors.pop();
134-
135-
return tag + ' ' + authors.join('\n' + tag + ' ');
136-
})
137-
164+
Z.debug(text);
138165
var translator = Z.loadTranslator('import');
139166
translator.setTranslator('1a3506da-a303-4b0a-a1cd-f216e6138d86'); //Refworks
140167
translator.setString(text);
141168

142169
var i = 0;
143170
translator.setHandler('itemDone', function(obj, newItem) {
144-
newItem.url = ids[i].url;
171+
//split names
172+
for(var i=0, n=newItem.creators.length; i<n; i++) {
173+
var creator = newItem.creators[i];
174+
if(creator.firstName) continue;
175+
176+
var lastSpace = creator.lastName.lastIndexOf(' ');
177+
if(creator.lastName.search(/[A-Za-z]/) !== -1 && lastSpace !== -1) {
178+
//western name. split on last space
179+
creator.firstName = creator.lastName.substr(0,lastSpace);
180+
creator.lastName = creator.lastName.substr(lastSpace+1);
181+
} else {
182+
//Chinese name. first character is last name, the rest are first name
183+
creator.firstName = creator.lastName.substr(1);
184+
creator.lastName = creator.lastName.charAt(0);
185+
}
186+
}
187+
188+
if(newItem.abstractNote) {
189+
newItem.abstractNote = newItem.abstractNote.replace(/\s*[\r\n]\s*/g, '\n');
190+
}
191+
192+
//clean up tags. Remove numbers from end
193+
for(var i=0, n=newItem.tags.length; i<n; i++) {
194+
newItem.tags[i] = newItem.tags[i].replace(/:\d+$/, '');
195+
}
196+
197+
newItem.title = ZU.trimInternal(newItem.title);
198+
if(itemInfo) {
199+
var info = itemInfo[newItem.title];
200+
if(!info) {
201+
Z.debug('No item info for "' + newItem.title + '"');
202+
} else {
203+
/*if(!info.pdfURL) {
204+
Z.debug('No PDF URL passed from multiples page');
205+
} else {
206+
newItem.attachments.push({
207+
title: 'Full Text PDF',
208+
mimeType: 'application/pdf',
209+
url: info.pdfURL
210+
})
211+
}*/
212+
213+
newItem.url = info.url;
214+
}
215+
} else {
216+
newItem.url = url;
217+
}
145218

146219
i++;
147220
newItem.complete();

0 commit comments

Comments
 (0)