From: notmasteryet Date: Thu, 15 Dec 2011 01:37:21 +0000 (-0600) Subject: Merge branch 'master' of git://github.com/mozilla/pdf.js.git into textsearch-1 X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=04551dbf578d599d22d2f54fe3bed3546126fcea;p=pdf.js.git Merge branch 'master' of git://github.com/mozilla/pdf.js.git into textsearch-1 Conflicts: src/core.js --- 04551dbf578d599d22d2f54fe3bed3546126fcea diff --cc src/core.js index 6a932f1,a6effd2..b498401 --- a/src/core.js +++ b/src/core.js @@@ -646,12 -618,41 +650,47 @@@ var PDFDoc = (function PDFDocClosure() throw data.error; }, this); + messageHandler.on('text_extracted', function pdfDocError(data) { - var index = data.index; ++ var index = data[0]; + if (this.textExtracted) + this.textExtracted(index); + }, this); + + messageHandler.on('jpeg_decode', function(data, promise) { + var imageData = data[0]; + var components = data[1]; + if (components != 3 && components != 1) + error('Only 3 component or 1 component can be returned'); + + var img = new Image(); + img.onload = (function jpegImageLoaderOnload() { + var width = img.width; + var height = img.height; + var size = width * height; + var rgbaLength = size * 4; + var buf = new Uint8Array(size * components); + var tmpCanvas = new ScratchCanvas(width, height); + var tmpCtx = tmpCanvas.getContext('2d'); + tmpCtx.drawImage(img, 0, 0); + var data = tmpCtx.getImageData(0, 0, width, height).data; + + if (components == 3) { + for (var i = 0, j = 0; i < rgbaLength; i += 4, j += 3) { + buf[j] = data[i]; + buf[j + 1] = data[i + 1]; + buf[j + 2] = data[i + 2]; + } + } else if (components == 1) { + for (var i = 0, j = 0; i < rgbaLength; i += 4, j++) { + buf[j] = data[i]; + } + } + promise.resolve({ data: buf, width: width, height: height}); + }).bind(this); + var src = 'data:image/jpeg;base64,' + window.btoa(imageData); + img.src = src; + }); + setTimeout(function pdfDocFontReadySetTimeout() { messageHandler.send('doc', this.data); this.workerReadyPromise.resolve(true); diff --cc src/fonts.js index 3c65a1a,83ce4ab..6bbbaf0 --- a/src/fonts.js +++ b/src/fonts.js @@@ -2139,8 -2101,39 +2143,39 @@@ var Font = (function FontClosure() return rule; }, + get spaceWidth() { + // trying to estimate space character width + var possibleSpaceReplacements = ['space', 'minus', 'one', 'i']; + var width; + for (var i = 0, ii = possibleSpaceReplacements.length; i < ii; i++) { + var glyphName = possibleSpaceReplacements[i]; + // if possible, getting width by glyph name + if (glyphName in this.widths) { + width = this.widths[glyphName]; + break; + } + var glyphUnicode = GlyphsUnicode[glyphName]; + // finding the charcode via unicodeToCID map + var charcode = 0; + if (this.composite) + charcode = this.unicodeToCID[glyphUnicode]; + // ... via toUnicode map + if (!charcode && 'toUnicode' in this) + charcode = this.toUnicode.indexOf(glyphUnicode); + // setting it to unicode if negative or undefined + if (!(charcode > 0)) + charcode = glyphUnicode; + // trying to get width via charcode + width = this.widths[charcode]; + if (width) + break; // the non-zero width found + } + width = (width || this.defaultWidth) * this.widthMultiplier; + return shadow(this, 'spaceWidth', width); + }, + charToGlyph: function fonts_charToGlyph(charcode) { - var unicode, width, codeIRQueue; + var fontChar, width, codeIRQueue; var width = this.widths[charcode]; diff --cc src/worker.js index 3cc91d0,c18de65..dea6339 --- a/src/worker.js +++ b/src/worker.js @@@ -160,39 -195,6 +195,39 @@@ var WorkerMessageHandler = handler.send('font_ready', [objId, obj]); }); + + handler.on('extract_text', function wphExtractText() { + var numPages = pdfDoc.numPages; + var index = []; + var start = Date.now(); + + function indexPage(pageNum) { + if (pageNum > numPages) { + console.log('text indexing=: time=%dms', Date.now() - start); + - handler.send('text_extracted', { index: index }); ++ handler.send('text_extracted', [index]); + return; + } + + var textContent = ''; + try { + var page = pdfDoc.getPage(pageNum); + textContent = page.extractTextContent(); + } catch (e) { + // Skip errored pages + } + + index.push(textContent); + + // processing one page, interrupting thread to process + // other requests + setTimeout(function extractTextNextPage() { + indexPage(pageNum + 1); + }, 0); + } + + indexPage(1); + }); } };