Recovering from misplaced/bad XRef

author notmasteryet <async.processingjs@yahoo.com>

Sat, 24 Sep 2011 01:50:21 +0000 (20:50 -0500)

committer notmasteryet <async.processingjs@yahoo.com>

Sat, 24 Sep 2011 01:50:21 +0000 (20:50 -0500)
author notmasteryet <async.processingjs@yahoo.com>
Sat, 24 Sep 2011 01:50:21 +0000 (20:50 -0500)
committer notmasteryet <async.processingjs@yahoo.com>
Sat, 24 Sep 2011 01:50:21 +0000 (20:50 -0500)
diff --git a/pdf.js b/pdf.js

index 6c66b84c43a41aa84aaa4d74b9fd4badeba4a5ad..d70885416797940be3d9bdc69f54cb7bafa00c52 100644 (file)
--- a/pdf.js
+++ b/pdf.js
@@ -3161,6 +3161,110 @@ var XRef = (function xRefXRef() {
          this.readXRef(prev);
        return streamParameters;
      },
+    // Fallback for a misplaced/bad XRef (readXRef ends by calling this): scans
+    // the raw bytes, records every 'N G obj' header in this.entries, reads the
+    // suspected XRef streams and returns the first trailer dictionary that has
+    // an 'ID' key. If none is found, error() is called and null is returned.
+    indexObjects: function indexObjects() {
+      // Returns the bytes from offset up to the next CR/LF (or the end of
+      // data, whichever comes first) as a string.
+      function readToken(data, offset) {
+        var token = '', dataLength = data.length;
+        while (offset < dataLength) {
+          var ch = data[offset++];
+          if (ch === 13 || ch === 10)
+            break;
+          token += String.fromCharCode(ch);
+        }
+        return token;
+      }
+      // Returns the number of bytes between offset and the next occurrence of
+      // `what`, or the number of bytes left when it does not occur.
+      function skipUntil(data, offset, what) {
+        var length = what.length, dataLength = data.length;
+        var start = offset, i;
+        var bytes = new Uint8Array(length);
+        for (i = 0; i < length; i++)
+          bytes[i] = what.charCodeAt(i);
+        // finding byte sequence
+        while (offset < dataLength) {
+          i = 0;
+          while (i < length && data[offset + i] === bytes[i])
+            ++i;
+          if (i >= length)
+            break; // sequence found
+          offset++;
+        }
+        return offset - start;
+      }
+      var stream = this.stream;
+      stream.pos = 0;
+      var buffer = stream.getBytes();
+      var position = 0, length = buffer.length;
+      var trailers = [], xrefStms = [];
+      var i, m, objHeader = /^(\d+)\s+(\d+)\s+obj\b/;
+      while (position < length) {
+        var ch = buffer[position];
+        if (ch === 32 || ch === 9 || ch === 13 || ch === 10) {
+          ++position;
+          continue;
+        }
+        if (ch === 37) { // %-comment
+          // a comment may run to the end of data (e.g. '%%EOF' with no EOL)
+          do {
+            ++position;
+          } while (position < length &&
+                   buffer[position] !== 13 && buffer[position] !== 10);
+          continue;
+        }
+        var token = readToken(buffer, position);
+        if (token === 'xref') {
+          // remember where the trailer starts, then skip to 'startxref'
+          position += skipUntil(buffer, position, 'trailer');
+          trailers.push(position);
+          position += skipUntil(buffer, position, 'startxref');
+        } else if ((m = objHeader.exec(token))) {
+          this.entries[m[1]] = {
+            offset: position,
+            gen: m[2] | 0,
+            uncompressed: true
+          };
+
+          var contentLength = skipUntil(buffer, position, 'endobj') + 7;
+          var content = buffer.subarray(position, position + contentLength);
+
+          // checking XRef stream suspect
+          // (it shall have '/XRef' and next char is not a letter)
+          var xrefTagOffset = skipUntil(content, 0, '/XRef');
+          if (xrefTagOffset < contentLength &&
+              content[xrefTagOffset + 5] < 64) {
+            xrefStms.push(position);
+            this.xrefstms[position] = 1; // don't read it recursively
+          }
+
+          position += contentLength;
+        } else
+          position += token.length + 1;
+      }
+      // reading XRef streams
+      for (i = 0; i < xrefStms.length; ++i)
+        this.readXRef(xrefStms[i]);
+      // finding main trailer
+      for (i = 0; i < trailers.length; ++i) {
+        stream.pos = trailers[i];
+        var parser = new Parser(new Lexer(stream), true);
+        if (!IsCmd(parser.getObj(), 'trailer'))
+          continue;
+        // read the trailer dictionary
+        var dict = parser.getObj();
+        // taking the first one with 'ID'
+        if (IsDict(dict) && dict.has('ID'))
+          return dict;
+      }
+      // nothing helps
+      error('Invalid PDF structure');
+      return null;
+    },
      readXRef: function readXref(startXRef) {
        var stream = this.stream;
        stream.pos = startXRef;
@@ -3178,8 +3282,7 @@ var XRef = (function xRefXRef() {
          }
          return this.readXRefStream(obj);
        }
-      error('Invalid XRef');
-      return null;
+      return this.indexObjects();
      },
      getEntry: function xRefGetEntry(i) {
        var e = this.entries[i];
author	notmasteryet <async.processingjs@yahoo.com>
	Sat, 24 Sep 2011 01:50:21 +0000 (20:50 -0500)
committer	notmasteryet <async.processingjs@yahoo.com>
	Sat, 24 Sep 2011 01:50:21 +0000 (20:50 -0500)