XML uses UTF-8 by default, so the metadata bytes need to be decoded from
UTF-8 into a JavaScript String before being fed to the DOMParser.
In an ideal world, the XML would actually be analyzed and the specified
charset would be used; however, that does not seem feasible unless JS
engines get iconv bindings.
Fixes GH-1692
if (isName(type) && isName(subtype) &&
type.name === 'Metadata' && subtype.name === 'XML') {
- metadata = stringToPDFString(bytesToString(stream.getBytes()));
+ // XXX: This should examine the charset the XML document defines,
+ // however since there are currently no real means to decode
+ // arbitrary charsets, let's just hope that the author of the PDF
+ // was reasonable enough to stick with the XML default charset,
+ // which is UTF-8.
+ metadata = stringToUTF8String(bytesToString(stream.getBytes()));
}
}
return str2;
}
+function stringToUTF8String(str) { // Decodes `str` as UTF-8; the visible caller passes a bytesToString() result, presumably one char per byte (confirm).
+ return decodeURIComponent(escape(str)); // escape() turns each char below 0x100 into %XX; decodeURIComponent() reads those bytes as UTF-8 and throws URIError on malformed input.
+}
+
/**
 * Tells whether the given value is a primitive boolean.
 *
 * @param {*} v - Value to inspect.
 * @returns {boolean} true when `typeof v` is 'boolean', false otherwise
 *   (Boolean wrapper objects report 'object' and therefore yield false).
 */
function isBool(v) {
  // typeof always produces a string, so strict comparison is equivalent.
  var kind = typeof v;
  return kind === 'boolean';
}