var pe = this.pe = new PartialEvaluator(
xref, handler, 'p' + this.pageNumber + '_');
- var IRQueue = {};
- return (this.IRQueue = pe.getIRQueue(content, resources, IRQueue,
- dependency));
+
+ this.operatorList = pe.getOperatorList(content, resources, dependency);
+ this.stats.timeEnd('Build IR Queue');
+ return this.operatorList;
},
- extractTextContent: function pageExtractPageContent() {
++ extractTextContent: function Page_extractTextContent() {
+   // Extracts the text of this page through PartialEvaluator.getTextContent
+   // and caches it on this.textContent; later calls return the cached value.
+   if ('textContent' in this)
+     return this.textContent;
+
+   var xref = this.xref;
+   var content = xref.fetchIfRef(this.content);
+   var resources = xref.fetchIfRef(this.resources);
+
+   if (isArray(content)) {
+     // The content is an array of items: resolve each of them and read
+     // them back-to-back as one stream.
+     var parts = [];
+     for (var k = 0, count = content.length; k < count; k++)
+       parts.push(xref.fetchIfRef(content[k]));
+     content = new StreamsSequenceStream(parts);
+   } else if (isStream(content)) {
+     content.reset();
+   }
+
+   // The evaluator is given a handler whose on/send do nothing.
+   var nullHandler = {
+     on: function nullHandlerOn() {},
+     send: function nullHandlerSend() {}
+   };
+   var evaluator = new PartialEvaluator(
+     xref, nullHandler, 'p' + this.pageNumber + '_');
+
+   var extracted = evaluator.getTextContent(content, resources);
+   this.textContent = extracted;
+   return extracted;
+ },
+
- ensureFonts: function pageEnsureFonts(fonts, callback) {
+ ensureFonts: function Page_ensureFonts(fonts, callback) {
+ this.stats.time('Font Loading');
// Convert the font names to the corresponding font obj.
for (var i = 0, ii = fonts.length; i < ii; i++) {
fonts[i] = this.objs.objs[fonts[i]].data;
if (page.displayReadyPromise)
page.displayReadyPromise.reject(data.error);
else
- throw data.error;
+ error(data.error);
}, this);
+ messageHandler.on('text_extracted', function pdfTextExtracted(data) {
+   // Hands the first element of the message payload to the optional
+   // textExtracted callback, when one has been set on this object.
+   var index = data[0];
+   if (!this.textExtracted)
+     return;
+   this.textExtracted(index);
+ }, this);
+
messageHandler.on('jpeg_decode', function(data, promise) {
var imageData = data[0];
var components = data[1];
return (this.pageCache[n] = page);
},
- extractText: function pdfDocExtractExtractText() {
++ extractText: function PDFDoc_extractText() {
+   // Sends an 'extract_text' message through messageHandler once
+   // workerReadyPromise resolves. Nothing is returned.
+   this.workerReadyPromise.then(function pdfDocExtractTextThen() {
+     this.messageHandler.send('extract_text');
+   }.bind(this));
+ },
+
- destroy: function pdfDocDestroy() {
+ destroy: function PDFDoc_destroy() {
if (this.worker)
this.worker.terminate();
var fontRes = resources.get('Font');
- // TODO: TOASK: Is it possible to get here? If so, what does
- // args[0].name should be like???
assert(fontRes, 'fontRes not available');
- fontRes = xref.fetchIfRef(fontRes);
- fontRef = fontRef || fontRes.get(fontName);
- var font = xref.fetchIfRef(fontRef);
+ font = xref.fetchIfRef(font) || fontRes.get(fontName);
assertWellFormed(isDict(font));
- if (!font.translated) {
+ if (!font.loadedName) {
font.translated = self.translateFont(font, xref, resources,
dependency);
if (font.translated) {
}
}
- return {
- fnArray: fnArray,
- argsArray: argsArray
- };
+ return queue;
},
+ getTextContent: function partialEvaluatorGetTextContent(stream, resources) {
+   // Scans the operators of `stream` and returns, as one string, the text
+   // shown by its Tj/TJ operators. Only Tf, Tj and TJ are interpreted; any
+   // other operator just discards the operands collected for it.
+   //   stream    - content stream handed to the Lexer/Parser.
+   //   resources - resource dictionary (or a reference to one); fonts named
+   //               by Tf are looked up in its 'Font' entry.
+   var self = this;
+   var xref = this.xref;
+
+   // Resolves the font named by a Tf operator and makes sure it carries a
+   // `translated` entry produced by translateFont.
+   function handleSetFont(fontName, fontRef) {
+     var fontRes = resources.get('Font');
+
+     // TODO: is it possible to get here without a 'Font' resource? If so,
+     // what should args[0].name look like?
+     assert(fontRes, 'fontRes not available');
+
+     fontRes = xref.fetchIfRef(fontRes);
+     fontRef = fontRef || fontRes.get(fontName);
+     var font = xref.fetchIfRef(fontRef);
+     assertWellFormed(isDict(font));
+     if (!font.translated) {
+       font.translated = self.translateFont(font, xref, resources);
+     }
+     return font;
+   }
+
+   // Returns the translated properties of `font`. Fails with a descriptive
+   // error, rather than a bare TypeError, when text is shown before any Tf
+   // operator selected a font or when the font has no `translated` entry.
+   function fontProperties(font) {
+     assertWellFormed(font && font.translated,
+                      'no usable font selected for text operator');
+     return font.translated.properties;
+   }
+
+   resources = xref.fetchIfRef(resources) || new Dict();
+
+   var parser = new Parser(new Lexer(stream), false);
+   var args = [], obj;
+
+   var text = '';
+   var font = null;
+   while (!isEOF(obj = parser.getObj())) {
+     if (isCmd(obj)) {
+       var cmd = obj.cmd;
+       switch (cmd) {
+         case 'Tf':
+           font = handleSetFont(args[0].name);
+           break;
+         case 'TJ':
+           var items = args[0];
+           for (var j = 0, jj = items.length; j < jj; j++) {
+             if (typeof items[j] === 'string') {
+               text += fontCharsToUnicode(items[j], fontProperties(font));
+             } else if (items[j] < 0) {
+               // making all negative offsets a space - better to have
+               // a space in incorrect place than not have them at all
+               text += ' ';
+             }
+           }
+           break;
+         case 'Tj':
+           text += fontCharsToUnicode(args[0], fontProperties(font));
+           break;
+       } // switch
+
+       // operands belong to one operator only; start collecting afresh
+       args = [];
+     } else if (obj != null) {
+       assertWellFormed(args.length <= 33, 'Too many arguments');
+       args.push(obj);
+     }
+   }
+
+   return text;
+ },
+
extractDataStructures: function
partialEvaluatorExtractDataStructures(dict, baseDict,
xref, properties) {
}
// MacRoman encoding address by re-encoding the cmap table
- fontChar = glyphName in GlyphsUnicode ?
- GlyphsUnicode[glyphName] :
- this.glyphNameMap[glyphName];
++
+ fontCharCode = glyphName in this.glyphNameMap ?
+ this.glyphNameMap[glyphName] : GlyphsUnicode[glyphName];
break;
default:
warn('Unsupported font type: ' + this.type);
depFonts: Object.keys(fonts)
});
}, this);
- handler.on('font', function wphSetupFont(data) {
- var objId = data[0];
- var name = data[1];
- var file = data[2];
- var properties = data[3];
-
- var font = {
- name: name,
- file: file,
- properties: properties
- };
-
- // Some fonts don't have a file, e.g. the build in ones like Arial.
- if (file) {
- var fontFileDict = new Dict();
- fontFileDict.map = file.dict.map;
-
- var fontFile = new Stream(file.bytes, file.start,
- file.end - file.start, fontFileDict);
-
- // Check if this is a FlateStream. Otherwise just use the created
- // Stream one. This makes complex_ttf_font.pdf work.
- var cmf = file.bytes[0];
- if ((cmf & 0x0f) == 0x08) {
- font.file = new FlateStream(fontFile);
- } else {
- font.file = fontFile;
- }
- }
-
- var obj = new Font(font.name, font.file, font.properties);
-
- var str = '';
- var objData = obj.data;
- if (objData) {
- var length = objData.length;
- for (var j = 0; j < length; ++j)
- str += String.fromCharCode(objData[j]);
- }
-
- obj.str = str;
-
- // Remove the data array form the font object, as it's not needed
- // anymore as we sent over the ready str.
- delete obj.data;
-
- handler.send('font_ready', [objId, obj]);
- });
-
+
- var numPages = pdfDoc.numPages;
+ handler.on('extract_text', function wphExtractText() {
- try {
- var page = pdfDoc.getPage(pageNum);
++ var numPages = pdfModel.numPages;
+ // Extracted text, one entry per page, pushed in page order.
+ var index = [];
+ var start = Date.now();
+
+ // Extracts the text of page `pageNum` and schedules the next page; once
+ // pageNum exceeds numPages the collected index is sent back.
+ function indexPage(pageNum) {
+ if (pageNum > numPages) {
+ console.log('text indexing: time=%dms', Date.now() - start);
+
+ handler.send('text_extracted', [index]);
+ return;
+ }
+
+ var textContent = '';
- } catch (e) {
- // Skip errored pages
- }
++ try {
++ var page = pdfModel.getPage(pageNum);
+ textContent = page.extractTextContent();
++ } catch (e) {
++ // Skip errored pages: the page keeps an empty entry, so the index stays
++ // aligned with page numbers and 'text_extracted' is still sent.
++ console.log('text indexing: skipping page %d: %s', pageNum, e);
++ }
+
+ index.push(textContent);
+
+ // processing one page, interrupting thread to process
+ // other requests
+ setTimeout(function extractTextNextPage() {
+ indexPage(pageNum + 1);
+ }, 0);
+ }
+
+ indexPage(1);
+ });
}
};
<button id="outlineSwitch" title="Show Document Outline" onclick="PDFView.switchSidebarView('outline')" disabled>
<img src="images/nav-outline.svg" align="top" height="16" alt="Document Outline" />
</button>
+ <button id="searchSwitch" title="Show Search" onclick="PDFView.switchSidebarView('search')">
+ <img src="images/edit-find.svg" align="top" height="16" alt="Search Document" />
+ </button>
</div>
- </div>
+ </div>
</div>
- <div id="loading">Loading... 0%</div>
+ <div id="loadingBox">
+ <div id="loading">Loading... 0%</div>
+ <div id="loadingBar"><div class="progress"></div></div>
+ </div>
<div id="viewer"></div>
</body>
</html>
this.page = 1;
}
+ if (PDFView.currentScale === kUnknownScale) {
+ // Scale was not initialized: invalid bookmark or scale was not specified.
+ // Setting the default one.
+ this.parseScale(kDefaultScale, true);
+ }
+
+ this.metadata = null;
+ var metadata = pdf.catalog.metadata;
+ var info = this.documentInfo = pdf.info;
+ var pdfTitle;
+
+ if (metadata) {
+ this.metadata = metadata = new PDFJS.Metadata(metadata);
+
+ if (metadata.has('dc:title'))
+ pdfTitle = metadata.get('dc:title');
+ }
+
+ if (!pdfTitle && info && info['Title'])
+ pdfTitle = info['Title'];
+
+ if (pdfTitle)
+ document.title = pdfTitle + ' - ' + document.title;
++
+ // losing pdf reference here, starting text indexing in 500ms
+ setTimeout((function loadStartTextExtraction() {
+ this.startTextExtraction(pdf);
+ }).bind(this), 500);
+ delete PDFView.extractedText;
+ },
+
+ startTextExtraction: function pdfViewStartTextExtraction(pdf) {
+   // Empties the #searchResults element, registers a callback that stores
+   // the extracted text on PDFView.extractedText, then asks `pdf` to
+   // extract its text.
+   var resultsElement = document.getElementById('searchResults');
+   resultsElement.textContent = '';
+
+   pdf.textExtracted = function pdfTextExtracted(pageTexts) {
+     PDFView.extractedText = pageTexts;
+   };
+   pdf.extractText();
+ },
+
+ search: function pdfViewStartSearch() {
+   // Searches the extracted page texts for the value of #searchTermsInput
+   // and fills #searchResults with one link per matching page. While
+   // PDFView.extractedText is not available yet, the search re-runs itself
+   // every second.
+
+   // Characters ignored on both sides of the comparison. The global form is
+   // for replace(); the non-global form is for test(), which would
+   // otherwise carry lastIndex state between calls.
+   var ignoredChars = /[\s-]/g;
+   var ignoredChar = /[\s-]/;
+
+   function bindLink(link, pageNumber) {
+     link.href = '#' + pageNumber;
+     link.onclick = function searchBindLink() {
+       PDFView.page = pageNumber;
+       return false;
+     };
+   }
+
+   // Maps an offset in the normalized text (ignored characters removed)
+   // back to the corresponding offset in the original text.
+   function originalOffset(text, normalizedOffset) {
+     var kept = 0;
+     for (var k = 0, kk = text.length; k < kk; k++) {
+       if (ignoredChar.test(text.charAt(k)))
+         continue;
+       if (kept === normalizedOffset)
+         return k;
+       kept++;
+     }
+     return text.length;
+   }
+
+   var searchResults = document.getElementById('searchResults');
+   if (!('extractedText' in PDFView)) {
+     // not indexed yet, repeat in 1 second
+     searchResults.textContent = 'Searching...';
+     setTimeout(this.search.bind(this), 1000);
+     return;
+   }
+
+   var searchTermsInput = document.getElementById('searchTermsInput');
+   searchResults.removeAttribute('hidden');
+   searchResults.textContent = '';
+
+   // simple search: removing whitespace and hyphens from both the terms and
+   // the page text, then scanning every page for the terms
+   var terms = searchTermsInput.value.replace(ignoredChars, '').toLowerCase();
+   if (!terms) {
+     // nothing left to look for; an empty term would match every page
+     return;
+   }
+
+   var index = PDFView.extractedText;
+   var pageFound = false;
+   for (var i = 0, ii = index.length; i < ii; i++) {
+     var pageText = index[i].replace(ignoredChars, '').toLowerCase();
+     var j = pageText.indexOf(terms);
+     if (j < 0)
+       continue;
+
+     var pageNumber = i + 1;
+     // j is an offset into the normalized text; translate it back before
+     // cutting the sample out of the original text
+     var textSample = index[i].substr(originalOffset(index[i], j), 50);
+     var link = document.createElement('a');
+     bindLink(link, pageNumber);
+     link.textContent = 'Page ' + pageNumber + ': ' + textSample;
+     searchResults.appendChild(link);
+
+     pageFound = true;
+   }
+   if (!pageFound) {
+     searchResults.textContent = '(Not found)';
+   }
},
setHash: function pdfViewSetHash(hash) {