From: Julian Viereck Date: Sun, 8 Apr 2012 15:57:55 +0000 (-0700) Subject: Merge text search with current master X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=3c77291013e23d04d923f623e17ac30b46cc273e;p=pdf.js.git Merge text search with current master --- 3c77291013e23d04d923f623e17ac30b46cc273e diff --cc src/core.js index 5cb8b1d,15cd147..4bd2cb2 --- a/src/core.js +++ b/src/core.js @@@ -206,42 -214,14 +217,44 @@@ var Page = (function PageClosure() var pe = this.pe = new PartialEvaluator( xref, handler, 'p' + this.pageNumber + '_'); - var IRQueue = {}; - return (this.IRQueue = pe.getIRQueue(content, resources, IRQueue, - dependency)); + + this.operatorList = pe.getOperatorList(content, resources, dependency); + this.stats.timeEnd('Build IR Queue'); + return this.operatorList; }, - extractTextContent: function pageExtractPageContent() { ++ extractTextContent: function Page_extractTextContent() { + if ('textContent' in this) { + // text content was extracted + return this.textContent; + } + + var handler = { + on: function nullHandlerOn() {}, + send: function nullHandlerSend() {} + }; + + var xref = this.xref; + var content = xref.fetchIfRef(this.content); + var resources = xref.fetchIfRef(this.resources); + if (isArray(content)) { + // fetching items + var i, n = content.length; + var streams = []; + for (i = 0; i < n; ++i) + streams.push(xref.fetchIfRef(content[i])); + content = new StreamsSequenceStream(streams); + } else if (isStream(content)) + content.reset(); + + var pe = new PartialEvaluator( + xref, handler, 'p' + this.pageNumber + '_'); + var text = pe.getTextContent(content, resources); + return (this.textContent = text); + }, + - ensureFonts: function pageEnsureFonts(fonts, callback) { + ensureFonts: function Page_ensureFonts(fonts, callback) { + this.stats.time('Font Loading'); // Convert the font names to the corresponding font obj. for (var i = 0, ii = fonts.length; i < ii; i++) { fonts[i] = this.objs.objs[fonts[i]].data; @@@ -770,15 -788,9 +821,15 @@@ var PDFDoc = (function PDFDocClosure() if (page.displayReadyPromise) page.displayReadyPromise.reject(data.error); else - throw data.error; + error(data.error); }, this); + messageHandler.on('text_extracted', function pdfTextExtracted(data) { + var index = data[0]; + if (this.textExtracted) + this.textExtracted(index); + }, this); + messageHandler.on('jpeg_decode', function(data, promise) { var imageData = data[0]; var components = data[1]; @@@ -843,13 -856,7 +895,13 @@@ return (this.pageCache[n] = page); }, - extractText: function pdfDocExtractExtractText() { ++ extractText: function PDFDoc_extractText() { + this.workerReadyPromise.then(function pdfDocStartRenderingThen() { + this.messageHandler.send('extract_text'); + }.bind(this)); + }, + - destroy: function pdfDocDestroy() { + destroy: function PDFDoc_destroy() { if (this.worker) this.worker.terminate(); diff --cc src/evaluator.js index ab057a2,350ab20..fd1c6d6 --- a/src/evaluator.js +++ b/src/evaluator.js @@@ -136,15 -149,11 +149,11 @@@ var PartialEvaluator = (function Partia var fontRes = resources.get('Font'); - // TODO: TOASK: Is it possible to get here? If so, what does - // args[0].name should be like??? assert(fontRes, 'fontRes not available'); - fontRes = xref.fetchIfRef(fontRes); - fontRef = fontRef || fontRes.get(fontName); - var font = xref.fetchIfRef(fontRef); + font = xref.fetchIfRef(font) || fontRes.get(fontName); assertWellFormed(isDict(font)); - if (!font.translated) { + if (!font.loadedName) { font.translated = self.translateFont(font, xref, resources, dependency); if (font.translated) { @@@ -450,77 -470,9 +470,74 @@@ } } - return { - fnArray: fnArray, - argsArray: argsArray - }; + return queue; }, + getTextContent: function partialEvaluatorGetIRQueue(stream, resources) { + + var self = this; + var xref = this.xref; + + function handleSetFont(fontName, fontRef) { + var fontRes = resources.get('Font'); + + // TODO: TOASK: Is it possible to get here? If so, what does + // args[0].name should be like??? + assert(fontRes, 'fontRes not available'); + + fontRes = xref.fetchIfRef(fontRes); + fontRef = fontRef || fontRes.get(fontName); + var font = xref.fetchIfRef(fontRef), tra; + assertWellFormed(isDict(font)); + if (!font.translated) { + font.translated = self.translateFont(font, xref, resources); + } + return font; + } + + resources = xref.fetchIfRef(resources) || new Dict(); + + var parser = new Parser(new Lexer(stream), false); + var res = resources; + var args = [], obj; + + var text = ''; + var font = null; + while (!isEOF(obj = parser.getObj())) { + if (isCmd(obj)) { + var cmd = obj.cmd; + switch (cmd) { + case 'Tf': + font = handleSetFont(args[0].name); + break; + case 'TJ': + var items = args[0]; + for (var j = 0, jj = items.length; j < jj; j++) { + if (typeof items[j] === 'string') { + text += fontCharsToUnicode(items[j], + font.translated.properties); + } else if (items[j] < 0) { + // making all negative offsets a space - better to have + // a space in incorrect place than not have them at all + text += ' '; + } + } + break; + case 'Tj': + text += fontCharsToUnicode(args[0], font.translated.properties); + break; + } // switch + + args = []; + } else if (obj != null) { + assertWellFormed(args.length <= 33, 'Too many arguments'); + args.push(obj); + } + } + + return text; + }, + extractDataStructures: function partialEvaluatorExtractDataStructures(dict, baseDict, xref, properties) { diff --cc src/fonts.js index d236289,7fdab8f..db596a2 --- a/src/fonts.js +++ b/src/fonts.js @@@ -2965,9 -2460,8 +3190,9 @@@ var Font = (function FontClosure() } // MacRoman encoding address by re-encoding the cmap table - fontChar = glyphName in GlyphsUnicode ? - GlyphsUnicode[glyphName] : - this.glyphNameMap[glyphName]; ++ + fontCharCode = glyphName in this.glyphNameMap ? + this.glyphNameMap[glyphName] : GlyphsUnicode[glyphName]; break; default: warn('Unsupported font type: ' + this.type); diff --cc src/worker.js index 6ea49d6,42bd610..b75fc66 --- a/src/worker.js +++ b/src/worker.js @@@ -139,88 -155,6 +155,39 @@@ var WorkerMessageHandler = depFonts: Object.keys(fonts) }); }, this); + - handler.on('font', function wphSetupFont(data) { - var objId = data[0]; - var name = data[1]; - var file = data[2]; - var properties = data[3]; - - var font = { - name: name, - file: file, - properties: properties - }; - - // Some fonts don't have a file, e.g. the build in ones like Arial. - if (file) { - var fontFileDict = new Dict(); - fontFileDict.map = file.dict.map; - - var fontFile = new Stream(file.bytes, file.start, - file.end - file.start, fontFileDict); - - // Check if this is a FlateStream. Otherwise just use the created - // Stream one. This makes complex_ttf_font.pdf work. - var cmf = file.bytes[0]; - if ((cmf & 0x0f) == 0x08) { - font.file = new FlateStream(fontFile); - } else { - font.file = fontFile; - } - } - - var obj = new Font(font.name, font.file, font.properties); - - var str = ''; - var objData = obj.data; - if (objData) { - var length = objData.length; - for (var j = 0; j < length; ++j) - str += String.fromCharCode(objData[j]); - } - - obj.str = str; - - // Remove the data array form the font object, as it's not needed - // anymore as we sent over the ready str. - delete obj.data; - - handler.send('font_ready', [objId, obj]); - }); - + handler.on('extract_text', function wphExtractText() { - var numPages = pdfDoc.numPages; ++ var numPages = pdfModel.numPages; + var index = []; + var start = Date.now(); + + function indexPage(pageNum) { + if (pageNum > numPages) { + console.log('text indexing: time=%dms', Date.now() - start); + + handler.send('text_extracted', [index]); + return; + } + + var textContent = ''; - try { - var page = pdfDoc.getPage(pageNum); ++ // try { ++ var page = pdfModel.getPage(pageNum); + textContent = page.extractTextContent(); - } catch (e) { - // Skip errored pages - } ++ // } catch (e) { ++ // // Skip errored pages ++ // } + + index.push(textContent); + + // processing one page, interrupting thread to process + // other requests + setTimeout(function extractTextNextPage() { + indexPage(pageNum + 1); + }, 0); + } + + indexPage(1); + }); } }; diff --cc web/viewer.html index 09c7195,d275f77..9ec535d --- a/web/viewer.html +++ b/web/viewer.html @@@ -140,14 -139,14 +146,17 @@@ + - + -
Loading... 0%
+
+
Loading... 0%
+
+
diff --cc web/viewer.js index bc41e36,3587c96..91639d9 --- a/web/viewer.js +++ b/web/viewer.js @@@ -457,67 -553,29 +553,91 @@@ var PDFView = this.page = 1; } + if (PDFView.currentScale === kUnknownScale) { + // Scale was not initialized: invalid bookmark or scale was not specified. + // Setting the default one. + this.parseScale(kDefaultScale, true); + } + + this.metadata = null; + var metadata = pdf.catalog.metadata; + var info = this.documentInfo = pdf.info; + var pdfTitle; + + if (metadata) { + this.metadata = metadata = new PDFJS.Metadata(metadata); + + if (metadata.has('dc:title')) + pdfTitle = metadata.get('dc:title'); + } + + if (!pdfTitle && info && info['Title']) + pdfTitle = info['Title']; + + if (pdfTitle) + document.title = pdfTitle + ' - ' + document.title; ++ + // loosing pdf reference here, starting text indexing in 500ms + setTimeout((function loadStartTextExtraction() { + this.startTextExtraction(pdf); + }).bind(this), 500); + delete PDFView.extractedText; + }, + + startTextExtraction: function pdfViewStartTextExtraction(pdf) { + var searchResults = document.getElementById('searchResults'); + searchResults.textContent = ''; + + pdf.textExtracted = function pdfTextExtracted(index) { + PDFView.extractedText = index; + }; + pdf.extractText(); + }, + + search: function pdfViewStartSearch() { + function bindLink(link, pageNumber) { + link.href = '#' + pageNumber; + link.onclick = function searchBindLink() { + PDFView.page = pageNumber; + return false; + }; + } + + var searchResults = document.getElementById('searchResults'); + if (!('extractedText' in PDFView)) { + // not indexed yet, repeat in 1 second + searchResults.textContent = 'Searching...'; + setTimeout(this.search.bind(this), 1000); + return; + } + + var searchTermsInput = document.getElementById('searchTermsInput'); + searchResults.removeAttribute('hidden'); + searchResults.textContent = ''; + + var terms = searchTermsInput.value; + // simple search: removing spaces and hyphens, then scanning every + terms = terms.replace(/\s-/g, '').toLowerCase(); + var index = PDFView.extractedText; + var pageFound = false; + for (var i = 0, ii = index.length; i < ii; i++) { + var pageText = index[i].replace(/\s-/g, '').toLowerCase(); + var j = pageText.indexOf(terms); + if (j < 0) + continue; + + var pageNumber = i + 1; + var textSample = index[i].substr(j, 50); + var link = document.createElement('a'); + bindLink(link, pageNumber); + link.textContent = 'Page ' + pageNumber + ': ' + textSample; + searchResults.appendChild(link); + + pageFound = true; + } + if (!pageFound) { + searchResults.textContent = '(Not found)'; + } }, setHash: function pdfViewSetHash(hash) {