if (isArray(content)) {
// fetching items
var i, n = content.length;
+ var streams = [];
for (i = 0; i < n; ++i)
- content[i] = xref.fetchIfRef(content[i]);
- content = new StreamsSequenceStream(content);
- }
+ streams.push(xref.fetchIfRef(content[i]));
+ content = new StreamsSequenceStream(streams);
+ } else if (isStream(content))
+ content.pos = 0;
var pe = this.pe = new PartialEvaluator(
xref, handler, 'p' + this.pageNumber + '_');
dependency));
},
+ /**
+  * Extracts the text of this page and caches it on the page object as
+  * `textContent`; once that property exists, later calls return it as is.
+  *
+  * @return the value produced by PartialEvaluator.getTextContent for the
+  *         page's content stream(s) and resources.
+  */
+ extractTextContent: function pageExtractTextContent() {
+   if ('textContent' in this) {
+     // text content was extracted
+     return this.textContent;
+   }
+
+   // The evaluator is constructed with a message handler; both callbacks
+   // here are no-ops, so nothing is sent or listened for during extraction.
+   var handler = {
+     on: function () {},
+     send: function() {}
+   };
+
+   var xref = this.xref;
+   var content = xref.fetchIfRef(this.content);
+   var resources = xref.fetchIfRef(this.resources);
+   if (isArray(content)) {
+     // fetching items into a separate array, leaving `content` itself
+     // unmodified
+     var i, n = content.length;
+     var streams = [];
+     for (i = 0; i < n; ++i)
+       streams.push(xref.fetchIfRef(content[i]));
+     content = new StreamsSequenceStream(streams);
+   } else if (isStream(content)) {
+     // reset the read position of a single stream before parsing it
+     content.pos = 0;
+   }
+
+   var pe = new PartialEvaluator(
+     xref, handler, 'p' + this.pageNumber + '_');
+   var text = pe.getTextContent(content, resources);
+   return (this.textContent = text);
+ },
+
ensureFonts: function pageEnsureFonts(fonts, callback) {
// Convert the font names to the corresponding font obj.
for (var i = 0, ii = fonts.length; i < ii; i++) {
throw data.error;
}, this);
+ // Hand the `index` carried by a 'text_extracted' message to the optional
+ // `textExtracted` callback on this object; without a callback the message
+ // is ignored.
+ messageHandler.on('text_extracted', function pdfDocTextExtracted(data) {
+   var index = data.index;
+   if (this.textExtracted) {
+     this.textExtracted(index);
+   }
+ }, this);
+
setTimeout(function pdfDocFontReadySetTimeout() {
messageHandler.send('doc', this.data);
this.workerReadyPromise.resolve(true);
return (this.pageCache[n] = page);
},
+ /**
+  * Sends an 'extract_text' message through this.messageHandler once
+  * this.workerReadyPromise has resolved. Returns nothing.
+  */
+ extractText: function pdfDocExtractText() {
+   this.workerReadyPromise.then(function pdfDocExtractTextThen() {
+     this.messageHandler.send('extract_text');
+   }.bind(this));
+ },
+
destroy: function pdfDocDestroy() {
if (this.worker)
this.worker.terminate();
fontRef = fontRef || fontRes.get(fontName);
var font = xref.fetchIfRef(fontRef);
assertWellFormed(isDict(font));
- if (!font.translated) {
+ if (!font.loadedName) {
font.translated = self.translateFont(font, xref, resources,
dependency);
if (font.translated) {
};
},
+ /**
+  * Scans a content stream and concatenates the string operands of the
+  * `Tj` and `TJ` operators, in the order they appear.
+  *
+  * Strings are appended exactly as the parser returns them. The font chosen
+  * by `Tf` is looked up (and translated if it has not been yet) but is not
+  * consulted when appending text.
+  *
+  * @param stream    content stream handed to the Lexer/Parser.
+  * @param resources resource dictionary, or a reference to one; an empty
+  *                  Dict is used when it resolves to a falsy value.
+  * @return {string} the concatenated text.
+  */
+ getTextContent: function partialEvaluatorGetTextContent(stream, resources) {
+
+   var self = this;
+   var xref = this.xref;
+
+   // Resolves the font dictionary named by `fontName` (or referenced by
+   // `fontRef`) from the Font resources and translates it on first use.
+   function handleSetFont(fontName, fontRef) {
+     var fontRes = resources.get('Font');
+
+     // TODO: confirm whether this can be reached without Font resources
+     // and, if so, what args[0].name looks like in that case.
+     assert(fontRes, 'fontRes not available');
+
+     fontRes = xref.fetchIfRef(fontRes);
+     fontRef = fontRef || fontRes.get(fontName);
+     var font = xref.fetchIfRef(fontRef);
+     assertWellFormed(isDict(font));
+     if (!font.translated) {
+       font.translated = self.translateFont(font, xref, resources);
+     }
+     return font;
+   }
+
+   resources = xref.fetchIfRef(resources) || new Dict();
+
+   var parser = new Parser(new Lexer(stream), false);
+   var args = [], obj;
+
+   var text = '';
+   var font = null;
+   while (!isEOF(obj = parser.getObj())) {
+     if (isCmd(obj)) {
+       var cmd = obj.cmd;
+       switch (cmd) {
+         case 'Tf':
+           font = handleSetFont(args[0].name);
+           break;
+         case 'TJ':
+           // Only the string entries are text; other entries are skipped.
+           // A missing or non-array operand is ignored instead of throwing.
+           var items = args[0];
+           if (Array.isArray(items)) {
+             for (var j = 0, jj = items.length; j < jj; j++) {
+               if (typeof items[j] === 'string')
+                 text += items[j];
+             }
+           }
+           break;
+         case 'Tj':
+           // Same string check as for TJ entries, so a missing operand
+           // does not append "undefined" to the result.
+           if (typeof args[0] === 'string')
+             text += args[0];
+           break;
+       } // switch
+
+       // operands belong to one operator only
+       args = [];
+     } else if (obj != null) {
+       assertWellFormed(args.length <= 33, 'Too many arguments');
+       args.push(obj);
+     }
+   }
+
+   return text;
+ },
+
extractDataStructures: function
partialEvaluatorExtractDataStructures(dict, baseDict,
xref, properties) {
if (type.name === 'Type3') {
properties.coded = true;
- var charProcs = xref.fetchIfRef(dict.get('CharProcs'));
- var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources;
- properties.resources = fontResources;
- properties.charProcIRQueues = {};
- for (var key in charProcs.map) {
- var glyphStream = xref.fetchIfRef(charProcs.map[key]);
- var queueObj = {};
- properties.charProcIRQueues[key] =
- this.getIRQueue(glyphStream, fontResources, queueObj, dependency);
+ // read char procs only if dependency is specified
+ if (dependency) {
+ var charProcs = xref.fetchIfRef(dict.get('CharProcs'));
+ var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources;
+ properties.resources = fontResources;
+ properties.charProcIRQueues = {};
+ for (var key in charProcs.map) {
+ var glyphStream = xref.fetchIfRef(charProcs.map[key]);
+ var queueObj = {};
+ properties.charProcIRQueues[key] =
+ this.getIRQueue(glyphStream, fontResources, queueObj, dependency);
+ }
}
}
handler.send('font_ready', [objId, obj]);
});
+
+ // On 'extract_text': collect the text of every page into `index` (one
+ // entry per page, in page order) and send it back as 'text_extracted'.
+ handler.on('extract_text', function wphExtractText() {
+   var numPages = pdfDoc.numPages;
+   var index = [];
+   // Take the start time once, before the loop, so the logged duration
+   // covers all pages and is still a number when there are no pages.
+   var start = Date.now();
+   for (var i = 0; i < numPages; i++) {
+     var textContent = '';
+     try {
+       var page = pdfDoc.getPage(i + 1);
+       textContent = page.extractTextContent();
+     } catch (e) {
+       // Skip errored pages: the page keeps an empty entry so positions in
+       // `index` still match page order, but the failure is reported.
+       console.log('text extraction failed for page %d: %s', i + 1, e);
+     }
+
+     index.push(textContent);
+   }
+
+   console.log('text indexing=: time=%dms', Date.now() - start);
+
+   handler.send('text_extracted', { index: index });
+ });
}
};
}
else
this.page = 1;
+
+ setTimeout((function loadStartTextExtraction() {
+ this.startTextExtraction(pdf);
+ }).bind(this), 500);
+ },
+
+ /**
+  * Installs a `textExtracted` callback on `pdf` that logs the joined index
+  * to the console, then calls pdf.extractText().
+  *
+  * @param pdf object exposing extractText(); its `textExtracted` property
+  *            is overwritten.
+  */
+ startTextExtraction: function pdfViewStartTextExtraction(pdf) {
+   pdf.textExtracted = function pdfTextExtracted(index) {
+     console.log(index.join());
+   };
+   pdf.extractText();
},
setHash: function pdfViewSetHash(hash) {