Skip to content

Commit 4d4ec4f

Browse files
committed
Add option to force input to be read as UTF-8
1 parent 4228b7a commit 4d4ec4f

File tree

7 files changed

+16
-8
lines changed

7 files changed

+16
-8
lines changed

Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ bench:
3030

3131
check-am:
3232
@echo "ambiguous..."; ./parsley test/ambiguous.let test/ambiguous.html 2>&1 | diff test/ambiguous.json - && echo " success."
33+
@echo "unicode..."; ./parsley test/unicode.let test/unicode.html 2>&1 | diff test/unicode.json - && echo " success."
3334
@echo "contains..."; ./parsley test/contains.let test/contains.html 2>&1 | diff test/contains.json - && echo " success."
3435
@echo "math_ambiguity..."; ./parsley test/math_ambiguity.let test/math_ambiguity.html 2>&1 | diff test/math_ambiguity.json - && echo " success."
3536
@echo "content..."; ./parsley test/content.let test/content.html 2>&1 | diff test/content.json - && echo " success."

Makefile.in

+1
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,7 @@ bench:
804804

805805
check-am:
806806
@echo "ambiguous..."; ./parsley test/ambiguous.let test/ambiguous.html 2>&1 | diff test/ambiguous.json - && echo " success."
807+
@echo "unicode..."; ./parsley test/unicode.let test/unicode.html 2>&1 | diff test/unicode.json - && echo " success."
807808
@echo "contains..."; ./parsley test/contains.let test/contains.html 2>&1 | diff test/contains.json - && echo " success."
808809
@echo "math_ambiguity..."; ./parsley test/math_ambiguity.let test/math_ambiguity.html 2>&1 | diff test/math_ambiguity.json - && echo " success."
809810
@echo "content..."; ./parsley test/content.let test/content.html 2>&1 | diff test/content.json - && echo " success."

parsley.c

+6-4
Original file line numberDiff line numberDiff line change
@@ -95,17 +95,18 @@ static parsedParsleyPtr parse_error(char* format, ...) {
9595
parsedParsleyPtr parsley_parse_file(parsleyPtr parsley, char* file, int flags) {
9696
xmlSetGenericErrorFunc(NULL , parsleyXsltError);
9797
bool html = flags & PARSLEY_OPTIONS_HTML;
98+
char * encoding = flags & PARSLEY_OPTIONS_FORCE_UTF8 ? "UTF-8" : NULL;
9899
if(html) {
99100
htmlParserCtxtPtr htmlCtxt = htmlNewParserCtxt();
100-
htmlDocPtr html = htmlCtxtReadFile(htmlCtxt, file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
101+
htmlDocPtr html = htmlCtxtReadFile(htmlCtxt, file, encoding, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
101102
htmlFreeParserCtxt(htmlCtxt);
102103
if(html == NULL) return parse_error("Couldn't parse file: %s\n", file);
103104
parsedParsleyPtr out = parsley_parse_doc(parsley, html, flags);
104105
xmlFreeDoc(html);
105106
return out;
106107
} else {
107108
xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
108-
xmlDocPtr xml = xmlCtxtReadFile(ctxt, file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
109+
xmlDocPtr xml = xmlCtxtReadFile(ctxt, file, encoding, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
109110
xmlFreeParserCtxt(ctxt);
110111
if(xml == NULL) return parse_error("Couldn't parse file: %s\n", file);
111112
parsedParsleyPtr out = parsley_parse_doc(parsley, xml, flags);
@@ -117,17 +118,18 @@ parsedParsleyPtr parsley_parse_file(parsleyPtr parsley, char* file, int flags) {
117118
parsedParsleyPtr parsley_parse_string(parsleyPtr parsley, char* string, size_t size, char* base_uri, int flags) {
118119
xmlSetGenericErrorFunc(NULL , parsleyXsltError);
119120
bool html = flags & PARSLEY_OPTIONS_HTML;
121+
char * encoding = flags & PARSLEY_OPTIONS_FORCE_UTF8 ? "UTF-8" : NULL;
120122
if(base_uri == NULL) base_uri = "http://parselets.com/in-memory-string";
121123
if(html) {
122124
htmlParserCtxtPtr htmlCtxt = htmlNewParserCtxt();
123-
htmlDocPtr html = htmlCtxtReadMemory(htmlCtxt, string, size, base_uri, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
125+
htmlDocPtr html = htmlCtxtReadMemory(htmlCtxt, string, size, base_uri, encoding, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
124126
if(html == NULL) return parse_error("Couldn't parse string");
125127
parsedParsleyPtr out = parsley_parse_doc(parsley, html, flags);
126128
xmlFreeDoc(html);
127129
return out;
128130
} else {
129131
xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
130-
xmlDocPtr xml = xmlCtxtReadMemory(ctxt, string, size, base_uri, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
132+
xmlDocPtr xml = xmlCtxtReadMemory(ctxt, string, size, base_uri, encoding, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
131133
if(xml == NULL) return parse_error("Couldn't parse string");
132134
parsedParsleyPtr out = parsley_parse_doc(parsley, xml, flags);
133135
xmlFreeDoc(xml);

parsley.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ enum {
6464
PARSLEY_OPTIONS_ALLOW_NET = 4,
6565
PARSLEY_OPTIONS_ALLOW_LOCAL = 8,
6666
PARSLEY_OPTIONS_COLLATE = 16,
67-
PARSLEY_OPTIONS_SGWRAP = 32
67+
PARSLEY_OPTIONS_SGWRAP = 32,
68+
PARSLEY_OPTIONS_FORCE_UTF8 = 64
6869
};
6970

7071
typedef parsley_context * contextPtr;

parsley_main.c

+5-1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ static struct argp_option options[] = {
4646
{"no-collate", 'N', 0, 0, "Don't collate array entries" },
4747
{"sg-wrap", 's', 0, 0, "Wrap text nodes for SelectorGadget compatibility" },
4848
{"user-agent", 'U', "USER_AGENT", 0, "Value of HTTP User-Agent header" },
49+
{"utf8", 'u', 0, 0, "Force input to be read as UTF-8" },
4950
{"no-net", 'z', 0, 0, "Disable ftp and http access for parselets" },
5051
{"no-filesystem", 'Z', 0, 0, "Disable filesystem access for parselets" },
5152
{ 0 }
@@ -62,6 +63,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state)
6263
case 'x':
6364
arguments->flags &= ~PARSLEY_OPTIONS_HTML;
6465
break;
66+
case 'u':
67+
arguments->flags |= PARSLEY_OPTIONS_FORCE_UTF8;
68+
break;
6569
case 'U':
6670
parsley_set_user_agent(arg);
6771
case 'n':
@@ -121,7 +125,7 @@ int main (int argc, char **argv) {
121125
struct list_elem *elemptr = &elem;
122126
elem.has_next = 0;
123127
arguments.output_xml = 0;
124-
arguments.flags = ~0 & ~PARSLEY_OPTIONS_SGWRAP;
128+
arguments.flags = ~0 & ~PARSLEY_OPTIONS_SGWRAP & ~PARSLEY_OPTIONS_FORCE_UTF8;
125129
arguments.include_files = elemptr;
126130
arguments.output_file = "-";
127131
argp_parse (&argp, argc, argv, 0, 0, &arguments);

util.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ _parsley_set_user_agent(char * agent) {
6969

7070
static void *
7171
xmlUserAgentIOHTTPOpen(const char * file_name) {
72-
return(xmlNanoHTTPMethod(file_name, NULL, NULL, NULL, parsley_user_agent_header, 0));
72+
return (void *)(xmlNanoHTTPMethod(file_name, NULL, NULL, NULL, parsley_user_agent_header, 0));
7373
}
7474

7575
void

xml2json.c

-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ static struct json_object * _xml2json(xmlNodePtr xml) {
3131
}
3232
break;
3333
case XML_TEXT_NODE:
34-
// json_object_put(json);
3534
json = json_object_new_string(xml->content);
3635
break;
3736
}

0 commit comments

Comments
 (0)