Decode XML metadata as UTF-8

author Nils Maier <maierman@web.de>

Sun, 27 May 2012 20:49:28 +0000 (22:49 +0200)

committer Nils Maier <maierman@web.de>

Sun, 27 May 2012 20:56:49 +0000 (22:56 +0200)
author Nils Maier <maierman@web.de>
Sun, 27 May 2012 20:49:28 +0000 (22:49 +0200)
committer Nils Maier <maierman@web.de>
Sun, 27 May 2012 20:56:49 +0000 (22:56 +0200)
diff --git a/src/obj.js b/src/obj.js

index 3432ac68dd6b3686c3f0813111945ee4153715b6..acc9e1284818ab79e29f54ede0e8acd4fa37e4cc 100644 (file)
--- a/src/obj.js
+++ b/src/obj.js
@@ -140,7 +140,12 @@ var Catalog = (function CatalogClosure() {
  
          if (isName(type) && isName(subtype) &&
              type.name === 'Metadata' && subtype.name === 'XML') {
-          metadata = stringToPDFString(bytesToString(stream.getBytes()));
+          // XXX: This should examine the charset the XML document defines,
+          // however since there are currently no real means to decode
+          // arbitrary charsets, let's just hope that the author of the PDF
+          // was reasonable enough to stick with the XML default charset,
+          // which is UTF-8.
+          metadata = stringToUTF8String(bytesToString(stream.getBytes()));
          }
        }
  
diff --git a/src/util.js b/src/util.js

index 90e6cee5d62737dac80205a61d29340f3d0c8b86..fe5d895e3312a9ac47b20a58b027d2873acf4dd8 100644 (file)
--- a/src/util.js
+++ b/src/util.js
@@ -302,6 +302,10 @@ function stringToPDFString(str) {
    return str2;
  }
  
+function stringToUTF8String(str) {
+  return decodeURIComponent(escape(str));
+}
+
  function isBool(v) {
    return typeof v == 'boolean';
  }
author	Nils Maier <maierman@web.de>
	Sun, 27 May 2012 20:49:28 +0000 (22:49 +0200)
committer	Nils Maier <maierman@web.de>
	Sun, 27 May 2012 20:56:49 +0000 (22:56 +0200)
src/obj.js		patch \| blob \| history
src/util.js		patch \| blob \| history