From: Brendan Dahl Date: Fri, 20 Apr 2012 23:49:08 +0000 (-0700) Subject: Merge new API and text search. X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=92d0d1d69428bb46992ac3505bd8da3259ee3474;p=pdf.js.git Merge new API and text search. --- 92d0d1d69428bb46992ac3505bd8da3259ee3474 diff --cc src/api.js index 3d97dac,0000000..dee6c66 mode 100644,000000..100644 --- a/src/api.js +++ b/src/api.js @@@ -1,590 -1,0 +1,596 @@@ +/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */ + +/** + * This is the main entry point for loading a PDF and interacting with it. + * NOTE: If a URL is used to fetch the PDF data a standard XMLHttpRequest(XHR) + * is used, which means it must follow the same origin rules that any XHR does + * e.g. No cross domain requests without CORS. + * + * @param {string|TypedAray} source Either a url to a PDF is located or a + * typed array already populated with data. + * @return {Promise} A promise that is resolved with {PDFDocumentProxy} object. + */ +PDFJS.getDocument = function getDocument(source) { + var promise = new PDFJS.Promise(); + var transport = new WorkerTransport(promise); + if (typeof source === 'string') { + // fetch url + PDFJS.getPdf( + { + url: source, + progress: function getPDFProgress(evt) { + if (evt.lengthComputable) + promise.progress({ + loaded: evt.loaded, + total: evt.total + }); + }, + error: function getPDFError(e) { + promise.reject('Unexpected server response of ' + + e.target.status + '.'); + } + }, + function getPDFLoad(data) { + transport.sendData(data); + }); + } else { + // assuming the source is array, instantiating directly from it + transport.sendData(source); + } + return promise; +}; + +/** + * Proxy to a PDFDocument in the worker thread. Also, contains commonly used + * properties that can be read synchronously. 
+ */ +var PDFDocumentProxy = (function() { + function PDFDocumentProxy(pdfInfo, transport) { + this.pdfInfo = pdfInfo; + this.transport = transport; + } + PDFDocumentProxy.prototype = { + /** + * @return {number} Total number of pages the PDF contains. + */ + get numPages() { + return this.pdfInfo.numPages; + }, + /** + * @return {string} A unique ID to identify a PDF. Not guaranteed to be + * unique. + */ + get fingerprint() { + return this.pdfInfo.fingerprint; + }, + /** + * @param {number} The page number to get. The first page is 1. + * @return {Promise} A promise that is resolved with a {PDFPageProxy} + * object. + */ + getPage: function(number) { + return this.transport.getPage(number); + }, + /** + * @return {Promise} A promise that is resolved with a lookup table for + * mapping named destinations to reference numbers. + */ + getDestinations: function() { + var promise = new PDFJS.Promise(); + var destinations = this.pdfInfo.destinations; + promise.resolve(destinations); + return promise; + }, + /** + * @return {Promise} A promise that is resolved with an {array} that is a + * tree outline (if it has one) of the PDF. The tree is in the format of: + * [ + * { + * title: string, + * bold: boolean, + * italic: boolean, + * color: rgb array, + * dest: dest obj, + * items: array of more items like this + * }, + * ... + * ]. + */ + getOutline: function() { + var promise = new PDFJS.Promise(); + var outline = this.pdfInfo.outline; + promise.resolve(outline); + return promise; + }, + /** + * @return {Promise} A promise that is resolved with an {object} that has + * info and metadata properties. Info is an {object} filled with anything + * available in the information dictionary and similarly metadata is a + * {Metadata} object with information from the metadata section of the PDF. 
+ */ + getMetadata: function() { + var promise = new PDFJS.Promise(); + var info = this.pdfInfo.info; + var metadata = this.pdfInfo.metadata; + promise.resolve({ + info: info, + metadata: metadata ? new PDFJS.Metadata(metadata) : null + }); + return promise; + }, + destroy: function() { + this.transport.destroy(); + } + }; + return PDFDocumentProxy; +})(); + +var PDFPageProxy = (function PDFPageProxyClosure() { + function PDFPageProxy(pageInfo, transport) { + this.pageInfo = pageInfo; + this.transport = transport; + this.stats = new StatTimer(); + this.stats.enabled = !!globalScope.PDFJS.enableStats; + this.objs = transport.objs; + this.renderInProgress = false; + } + PDFPageProxy.prototype = { + /** + * @return {number} Page number of the page. First page is 1. + */ + get pageNumber() { + return this.pageInfo.pageIndex + 1; + }, + /** + * @return {number} The number of degrees the page is rotated clockwise. + */ + get rotate() { + return this.pageInfo.rotate; + }, + /** + * @return {object} The reference that points to this page. It has 'num' and + * 'gen' properties. + */ + get ref() { + return this.pageInfo.ref; + }, + /** + * @return {array} An array of the visible portion of the PDF page in the + * user space units - [x1, y1, x2, y2]. + */ + get view() { + return this.pageInfo.view; + }, + /** + * @param {number} scale The desired scale of the viewport. + * @param {number} rotate Degrees to rotate the viewport. If omitted this + * defaults to the page rotation. + * @return {PageViewport} Contains 'width' and 'height' properties along + * with transforms required for rendering. + */ + getViewport: function(scale, rotate) { + if (arguments.length < 2) + rotate = this.rotate; + return new PDFJS.PageViewport(this.view, scale, rotate, 0, 0); + }, + /** + * @return {Promise} A promise that is resolved with an {array} of the + * annotation objects. 
+ */ + getAnnotations: function() { + if (this.annotationsPromise) + return this.annotationsPromise; + + var promise = new PDFJS.Promise(); + this.annotationsPromise = promise; + this.transport.getAnnotations(this.pageInfo.pageIndex); + return promise; + }, + /** + * Begins the process of rendering a page to the desired context. + * @param {object} params A parameter object that supports: + * { + * canvasContext(required): A 2D context of a DOM Canvas object., + * textLayer(optional): An object that has beginLayout, endLayout, and + * appendText functions. + * }. + * @return {Promise} A promise that is resolved when the page finishes + * rendering. + */ + render: function(params) { + this.renderInProgress = true; + + var promise = new Promise(); + var stats = this.stats; + stats.time('Overall'); + // If there is no displayReadyPromise yet, then the operatorList was never + // requested before. Make the request and create the promise. + if (!this.displayReadyPromise) { + this.displayReadyPromise = new Promise(); + this.destroyed = false; + + this.stats.time('Page Request'); + this.transport.messageHandler.send('RenderPageRequest', { + pageIndex: this.pageNumber - 1 + }); + } + + var self = this; + function complete(error) { + self.renderInProgress = false; + if (self.destroyed) { + delete self.operatorList; + delete self.displayReadyPromise; + } + + if (error) + promise.reject(error); + else + promise.resolve(); + }; + + // Once the operatorList and fonts are loaded, do the actual rendering. + this.displayReadyPromise.then( + function pageDisplayReadyPromise() { + if (self.destroyed) { + complete(); + return; + } + + var gfx = new CanvasGraphics(params.canvasContext, + this.objs, params.textLayer); + try { + this.display(gfx, params.viewport, complete); + } catch (e) { + complete(e); + } + }.bind(this), + function pageDisplayReadPromiseError(reason) { + complete(reason); + } + ); + + return promise; + }, + /** + * For internal use only. 
+ */ + startRenderingFromOperatorList: + function PDFPageWrapper_startRenderingFromOperatorList(operatorList, + fonts) { + var self = this; + this.operatorList = operatorList; + + var displayContinuation = function pageDisplayContinuation() { + // Always defer call to display() to work around bug in + // Firefox error reporting from XHR callbacks. + setTimeout(function pageSetTimeout() { + self.displayReadyPromise.resolve(); + }); + }; + + this.ensureFonts(fonts, + function pageStartRenderingFromOperatorListEnsureFonts() { + displayContinuation(); + } + ); + }, + /** + * For internal use only. + */ + ensureFonts: function PDFPageWrapper_ensureFonts(fonts, callback) { + this.stats.time('Font Loading'); + // Convert the font names to the corresponding font obj. + for (var i = 0, ii = fonts.length; i < ii; i++) { + fonts[i] = this.objs.objs[fonts[i]].data; + } + + // Load all the fonts + FontLoader.bind( + fonts, + function pageEnsureFontsFontObjs(fontObjs) { + this.stats.timeEnd('Font Loading'); + + callback.call(this); + }.bind(this) + ); + }, + /** + * For internal use only. + */ + display: function PDFPageWrapper_display(gfx, viewport, callback) { + var stats = this.stats; + stats.time('Rendering'); + + gfx.beginDrawing(viewport); + + var startIdx = 0; + var length = this.operatorList.fnArray.length; + var operatorList = this.operatorList; + var stepper = null; + if (PDFJS.pdfBug && StepperManager.enabled) { + stepper = StepperManager.create(this.pageNumber - 1); + stepper.init(operatorList); + stepper.nextBreakPoint = stepper.getNextBreakPoint(); + } + + var self = this; + function next() { + startIdx = + gfx.executeOperatorList(operatorList, startIdx, next, stepper); + if (startIdx == length) { + gfx.endDrawing(); + stats.timeEnd('Rendering'); + stats.timeEnd('Overall'); + if (callback) callback(); + } + } + next(); + }, + /** - * Stub for future feature. ++ * @return {Promise} That is resolved with the a {string} that is the text ++ * content from the page. 
+ */ + getTextContent: function() { + var promise = new PDFJS.Promise(); - var textContent = 'page text'; // not implemented - promise.resolve(textContent); ++ this.transport.messageHandler.send('GetTextContent', { ++ pageIndex: this.pageNumber - 1 ++ }, ++ function textContentCallback(textContent) { ++ promise.resolve(textContent); ++ } ++ ); + return promise; + }, + /** + * Stub for future feature. + */ + getOperationList: function() { + var promise = new PDFJS.Promise(); + var operationList = { // not implemented + dependencyFontsID: null, + operatorList: null + }; + promise.resolve(operationList); + return promise; + }, + /** + * Destroys resources allocated by the page. + */ + destroy: function() { + this.destroyed = true; + + if (!this.renderInProgress) { + delete this.operatorList; + delete this.displayReadyPromise; + } + } + }; + return PDFPageProxy; +})(); +/** + * For internal use only. + */ +var WorkerTransport = (function WorkerTransportClosure() { + function WorkerTransport(promise) { + this.workerReadyPromise = promise; + this.objs = new PDFObjects(); + + this.pageCache = []; + this.pagePromises = []; + this.fontsLoading = {}; + + // If worker support isn't disabled explicit and the browser has worker + // support, create a new web worker and test if it/the browser fullfills + // all requirements to run parts of pdf.js in a web worker. + // Right now, the requirement is, that an Uint8Array is still an Uint8Array + // as it arrives on the worker. Chrome added this with version 15. + if (!globalScope.PDFJS.disableWorker && typeof Worker !== 'undefined') { + var workerSrc = PDFJS.workerSrc; + if (typeof workerSrc === 'undefined') { + error('No PDFJS.workerSrc specified'); + } + + try { + var worker; + if (PDFJS.isFirefoxExtension) { + // The firefox extension can't load the worker from the resource:// + // url so we have to inline the script and then use the blob loader. 
+ var bb = new MozBlobBuilder(); + bb.append(document.querySelector('#PDFJS_SCRIPT_TAG').textContent); + var blobUrl = window.URL.createObjectURL(bb.getBlob()); + worker = new Worker(blobUrl); + } else { + // Some versions of FF can't create a worker on localhost, see: + // https://bugzilla.mozilla.org/show_bug.cgi?id=683280 + worker = new Worker(workerSrc); + } + + var messageHandler = new MessageHandler('main', worker); + this.messageHandler = messageHandler; + + messageHandler.on('test', function transportTest(supportTypedArray) { + if (supportTypedArray) { + this.worker = worker; + this.setupMessageHandler(messageHandler); + } else { + globalScope.PDFJS.disableWorker = true; + this.setupFakeWorker(); + } + }.bind(this)); + + var testObj = new Uint8Array(1); + // Some versions of Opera throw a DATA_CLONE_ERR on + // serializing the typed array. + messageHandler.send('test', testObj); + return; + } catch (e) { + warn('The worker has been disabled.'); + } + } + // Either workers are disabled, not supported or have thrown an exception. + // Thus, we fallback to a faked worker. + globalScope.PDFJS.disableWorker = true; + this.setupFakeWorker(); + } + WorkerTransport.prototype = { + destroy: function WorkerTransport_destroy() { + if (this.worker) + this.worker.terminate(); + + this.pageCache = []; + this.pagePromises = []; + }, + setupFakeWorker: function WorkerTransport_setupFakeWorker() { + // If we don't use a worker, just post/sendMessage to the main thread. + var fakeWorker = { + postMessage: function WorkerTransport_postMessage(obj) { + fakeWorker.onmessage({data: obj}); + }, + terminate: function WorkerTransport_terminate() {} + }; + + var messageHandler = new MessageHandler('main', fakeWorker); + this.setupMessageHandler(messageHandler); + + // If the main thread is our worker, setup the handling for the messages + // the main thread sends to it self. 
+ WorkerMessageHandler.setup(messageHandler); + }, + + setupMessageHandler: + function WorkerTransport_setupMessageHandler(messageHandler) { + this.messageHandler = messageHandler; + + messageHandler.on('GetDoc', function transportDoc(data) { + var pdfInfo = data.pdfInfo; + var pdfDocument = new PDFDocumentProxy(pdfInfo, this); + this.pdfDocument = pdfDocument; + this.workerReadyPromise.resolve(pdfDocument); + }, this); + + messageHandler.on('GetPage', function transportPage(data) { + var pageInfo = data.pageInfo; + var page = new PDFPageProxy(pageInfo, this); + this.pageCache[pageInfo.pageIndex] = page; + var promise = this.pagePromises[pageInfo.pageIndex]; + promise.resolve(page); + }, this); + + messageHandler.on('GetAnnotations', function transportAnnotations(data) { + var annotations = data.annotations; + var promise = this.pageCache[data.pageIndex].annotationsPromise; + promise.resolve(annotations); + }, this); + + messageHandler.on('RenderPage', function transportRender(data) { + var page = this.pageCache[data.pageIndex]; + var depFonts = data.depFonts; + + page.stats.timeEnd('Page Request'); + page.startRenderingFromOperatorList(data.operatorList, depFonts); + }, this); + + messageHandler.on('obj', function transportObj(data) { + var id = data[0]; + var type = data[1]; + if (this.objs.hasData(id)) + return; + + switch (type) { + case 'JpegStream': + var imageData = data[2]; + loadJpegStream(id, imageData, this.objs); + break; + case 'Image': + var imageData = data[2]; + this.objs.resolve(id, imageData); + break; + case 'Font': + var name = data[2]; + var file = data[3]; + var properties = data[4]; + + if (file) { + // Rewrap the ArrayBuffer in a stream. + var fontFileDict = new Dict(); + file = new Stream(file, 0, file.length, fontFileDict); + } + + // At this point, only the font object is created but the font is + // not yet attached to the DOM. This is done in `FontLoader.bind`. 
+ var font = new Font(name, file, properties); + this.objs.resolve(id, font); + break; + default: + error('Got unkown object type ' + type); + } + }, this); + + messageHandler.on('PageError', function transportError(data) { + var page = this.pageCache[data.pageNum - 1]; + if (page.displayReadyPromise) + page.displayReadyPromise.reject(data.error); + else + error(data.error); + }, this); + + messageHandler.on('JpegDecode', function(data, promise) { + var imageData = data[0]; + var components = data[1]; + if (components != 3 && components != 1) + error('Only 3 component or 1 component can be returned'); + + var img = new Image(); + img.onload = (function messageHandler_onloadClosure() { + var width = img.width; + var height = img.height; + var size = width * height; + var rgbaLength = size * 4; + var buf = new Uint8Array(size * components); + var tmpCanvas = createScratchCanvas(width, height); + var tmpCtx = tmpCanvas.getContext('2d'); + tmpCtx.drawImage(img, 0, 0); + var data = tmpCtx.getImageData(0, 0, width, height).data; + + if (components == 3) { + for (var i = 0, j = 0; i < rgbaLength; i += 4, j += 3) { + buf[j] = data[i]; + buf[j + 1] = data[i + 1]; + buf[j + 2] = data[i + 2]; + } + } else if (components == 1) { + for (var i = 0, j = 0; i < rgbaLength; i += 4, j++) { + buf[j] = data[i]; + } + } + promise.resolve({ data: buf, width: width, height: height}); + }).bind(this); + var src = 'data:image/jpeg;base64,' + window.btoa(imageData); + img.src = src; + }); + }, + + sendData: function WorkerTransport_sendData(data) { + this.messageHandler.send('GetDocRequest', data); + }, + + getPage: function WorkerTransport_getPage(pageNumber, promise) { + var pageIndex = pageNumber - 1; + if (pageIndex in this.pagePromises) + return this.pagePromises[pageIndex]; + var promise = new PDFJS.Promise('Page ' + pageNumber); + this.pagePromises[pageIndex] = promise; + this.messageHandler.send('GetPageRequest', { pageIndex: pageIndex }); + return promise; + }, + + getAnnotations: 
function WorkerTransport_getAnnotations(pageIndex) { + this.messageHandler.send('GetAnnotationsRequest', + { pageIndex: pageIndex }); + } + }; + return WorkerTransport; + +})(); diff --cc src/core.js index 90a2c97,38f264b..5ce133d --- a/src/core.js +++ b/src/core.js @@@ -137,8 -203,8 +137,9 @@@ var Page = (function PageClosure() var resources = this.resources; if (isArray(content)) { // fetching items + var streams = []; var i, n = content.length; + var streams = []; for (i = 0; i < n; ++i) streams.push(xref.fetchIfRef(content[i])); content = new StreamsSequenceStream(streams); @@@ -152,9 -218,112 +153,55 @@@ var pe = this.pe = new PartialEvaluator( xref, handler, 'p' + this.pageNumber + '_'); - this.operatorList = pe.getOperatorList(content, resources, dependency); - this.stats.timeEnd('Build IR Queue'); - return this.operatorList; + return pe.getOperatorList(content, resources, dependency); }, - + extractTextContent: function Page_extractTextContent() { + if ('textContent' in this) { + // text content was extracted + return this.textContent; + } + + var handler = { + on: function nullHandlerOn() {}, + send: function nullHandlerSend() {} + }; + + var xref = this.xref; + var content = xref.fetchIfRef(this.content); + var resources = xref.fetchIfRef(this.resources); + if (isArray(content)) { + // fetching items + var i, n = content.length; + var streams = []; + for (i = 0; i < n; ++i) + streams.push(xref.fetchIfRef(content[i])); + content = new StreamsSequenceStream(streams); + } else if (isStream(content)) + content.reset(); + + var pe = new PartialEvaluator( + xref, handler, 'p' + this.pageNumber + '_'); + var text = pe.getTextContent(content, resources); + return (this.textContent = text); + }, + ensureFonts: function Page_ensureFonts(fonts, callback) { + this.stats.time('Font Loading'); + // Convert the font names to the corresponding font obj. 
+ for (var i = 0, ii = fonts.length; i < ii; i++) { + fonts[i] = this.objs.objs[fonts[i]].data; + } + + // Load all the fonts + FontLoader.bind( + fonts, + function pageEnsureFontsFontObjs(fontObjs) { + this.stats.timeEnd('Font Loading'); + + callback.call(this); + }.bind(this) + ); + }, - - display: function Page_display(gfx, callback) { - var stats = this.stats; - stats.time('Rendering'); - var xref = this.xref; - var resources = this.resources; - var mediaBox = this.mediaBox; - assertWellFormed(isDict(resources), 'invalid page resources'); - - gfx.xref = xref; - gfx.res = resources; - gfx.beginDrawing({ x: mediaBox[0], y: mediaBox[1], - width: this.width, - height: this.height, - rotate: this.rotate }); - - var startIdx = 0; - var length = this.operatorList.fnArray.length; - var operatorList = this.operatorList; - var stepper = null; - if (PDFJS.pdfBug && StepperManager.enabled) { - stepper = StepperManager.create(this.pageNumber); - stepper.init(operatorList); - stepper.nextBreakPoint = stepper.getNextBreakPoint(); - } - - var self = this; - function next() { - startIdx = - gfx.executeOperatorList(operatorList, startIdx, next, stepper); - if (startIdx == length) { - gfx.endDrawing(); - stats.timeEnd('Rendering'); - stats.timeEnd('Overall'); - if (callback) callback(); - } - } - next(); - }, - rotatePoint: function Page_rotatePoint(x, y, reverse) { - var rotate = reverse ? 
(360 - this.rotate) : this.rotate; - switch (rotate) { - case 180: - return {x: this.width - x, y: y}; - case 90: - return {x: this.width - y, y: this.height - x}; - case 270: - return {x: y, y: x}; - case 360: - case 0: - default: - return {x: x, y: this.height - y}; - } - }, getLinks: function Page_getLinks() { var links = []; var annotations = pageGetAnnotations(); @@@ -464,5 -668,279 +511,6 @@@ var PDFDocument = (function PDFDocument } }; - return PDFDocModel; + return PDFDocument; })(); + -var PDFDoc = (function PDFDocClosure() { - function PDFDoc(arg, callback) { - var stream = null; - var data = null; - - if (isStream(arg)) { - stream = arg; - data = arg.bytes; - } else if (isArrayBuffer(arg)) { - stream = new Stream(arg); - data = arg; - } else { - error('PDFDoc: Unknown argument type'); - } - - this.data = data; - this.stream = stream; - this.pdfModel = new PDFDocModel(stream); - this.fingerprint = this.pdfModel.getFingerprint(); - this.info = this.pdfModel.getDocumentInfo(); - this.catalog = this.pdfModel.catalog; - this.objs = new PDFObjects(); - - this.pageCache = []; - this.fontsLoading = {}; - this.workerReadyPromise = new Promise('workerReady'); - - this.pageText = []; - this.startedTextExtraction = false; - - // If worker support isn't disabled explicit and the browser has worker - // support, create a new web worker and test if it/the browser fullfills - // all requirements to run parts of pdf.js in a web worker. - // Right now, the requirement is, that an Uint8Array is still an Uint8Array - // as it arrives on the worker. Chrome added this with version 15. - if (!globalScope.PDFJS.disableWorker && typeof Worker !== 'undefined') { - var workerSrc = PDFJS.workerSrc; - if (typeof workerSrc === 'undefined') { - error('No PDFJS.workerSrc specified'); - } - - try { - var worker; - if (PDFJS.isFirefoxExtension) { - // The firefox extension can't load the worker from the resource:// - // url so we have to inline the script and then use the blob loader. 
- var bb = new MozBlobBuilder(); - bb.append(document.querySelector('#PDFJS_SCRIPT_TAG').textContent); - var blobUrl = window.URL.createObjectURL(bb.getBlob()); - worker = new Worker(blobUrl); - } else { - // Some versions of FF can't create a worker on localhost, see: - // https://bugzilla.mozilla.org/show_bug.cgi?id=683280 - worker = new Worker(workerSrc); - } - - var messageHandler = new MessageHandler('main', worker); - - messageHandler.on('test', function pdfDocTest(supportTypedArray) { - if (supportTypedArray) { - this.worker = worker; - this.setupMessageHandler(messageHandler); - } else { - globalScope.PDFJS.disableWorker = true; - this.setupFakeWorker(); - } - }.bind(this)); - - var testObj = new Uint8Array(1); - // Some versions of Opera throw a DATA_CLONE_ERR on - // serializing the typed array. - messageHandler.send('test', testObj); - return; - } catch (e) { - warn('The worker has been disabled.'); - } - } - // Either workers are disabled, not supported or have thrown an exception. - // Thus, we fallback to a faked worker. - globalScope.PDFJS.disableWorker = true; - this.setupFakeWorker(); - } - - PDFDoc.prototype = { - setupFakeWorker: function PDFDoc_setupFakeWorker() { - // If we don't use a worker, just post/sendMessage to the main thread. - var fakeWorker = { - postMessage: function PDFDoc_postMessage(obj) { - fakeWorker.onmessage({data: obj}); - }, - terminate: function PDFDoc_terminate() {} - }; - - var messageHandler = new MessageHandler('main', fakeWorker); - this.setupMessageHandler(messageHandler); - - // If the main thread is our worker, setup the handling for the messages - // the main thread sends to it self. 
- WorkerMessageHandler.setup(messageHandler); - }, - - setupMessageHandler: function PDFDoc_setupMessageHandler(messageHandler) { - this.messageHandler = messageHandler; - - messageHandler.on('page', function pdfDocPage(data) { - var pageNum = data.pageNum; - var page = this.pageCache[pageNum]; - var depFonts = data.depFonts; - - page.stats.timeEnd('Page Request'); - page.startRenderingFromOperatorList(data.operatorList, depFonts); - }, this); - - messageHandler.on('obj', function pdfDocObj(data) { - var id = data[0]; - var type = data[1]; - - switch (type) { - case 'JpegStream': - var imageData = data[2]; - loadJpegStream(id, imageData, this.objs); - break; - case 'Image': - var imageData = data[2]; - this.objs.resolve(id, imageData); - break; - case 'Font': - var name = data[2]; - var file = data[3]; - var properties = data[4]; - - if (file) { - // Rewrap the ArrayBuffer in a stream. - var fontFileDict = new Dict(); - file = new Stream(file, 0, file.length, fontFileDict); - } - - // At this point, only the font object is created but the font is - // not yet attached to the DOM. This is done in `FontLoader.bind`. 
- var font = new Font(name, file, properties); - this.objs.resolve(id, font); - break; - default: - error('Got unkown object type ' + type); - } - }, this); - - messageHandler.on('page_error', function pdfDocError(data) { - var page = this.pageCache[data.pageNum]; - if (page.displayReadyPromise) - page.displayReadyPromise.reject(data.error); - else - error(data.error); - }, this); - - messageHandler.on('text_extracted', function pdfTextExtracted(data) { - var pageNum = data[0]; - var content = data[1]; - if (pageNum !== this.pageText.length + 1) - error('pdfTextExtracted: pageIdx and pageText length got to fit'); - - this.pageText.push(content); - - if (this.textExtracted) - this.textExtracted(pageNum, content); - - if (pageNum < this.numPages) - this.extractTextPage(pageNum + 1); - }, this); - - messageHandler.on('jpeg_decode', function(data, promise) { - var imageData = data[0]; - var components = data[1]; - if (components != 3 && components != 1) - error('Only 3 component or 1 component can be returned'); - - var img = new Image(); - img.onload = (function messageHandler_onloadClosure() { - var width = img.width; - var height = img.height; - var size = width * height; - var rgbaLength = size * 4; - var buf = new Uint8Array(size * components); - var tmpCanvas = createScratchCanvas(width, height); - var tmpCtx = tmpCanvas.getContext('2d'); - tmpCtx.drawImage(img, 0, 0); - var data = tmpCtx.getImageData(0, 0, width, height).data; - - if (components == 3) { - for (var i = 0, j = 0; i < rgbaLength; i += 4, j += 3) { - buf[j] = data[i]; - buf[j + 1] = data[i + 1]; - buf[j + 2] = data[i + 2]; - } - } else if (components == 1) { - for (var i = 0, j = 0; i < rgbaLength; i += 4, j++) { - buf[j] = data[i]; - } - } - promise.resolve({ data: buf, width: width, height: height}); - }).bind(this); - var src = 'data:image/jpeg;base64,' + window.btoa(imageData); - img.src = src; - }); - - setTimeout(function pdfDocFontReadySetTimeout() { - messageHandler.send('doc', this.data); - 
this.workerReadyPromise.resolve(true); - }.bind(this)); - }, - - get numPages() { - return this.pdfModel.numPages; - }, - - startRendering: function PDFDoc_startRendering(page) { - // The worker might not be ready to receive the page request yet. - this.workerReadyPromise.then(function pdfDocStartRenderingThen() { - page.stats.time('Page Request'); - this.messageHandler.send('page_request', page.pageNumber + 1); - }.bind(this)); - }, - - getPage: function PDFDoc_getPage(n) { - if (this.pageCache[n]) - return this.pageCache[n]; - - var page = this.pdfModel.getPage(n); - // Add a reference to the objects such that Page can forward the reference - // to the CanvasGraphics and so on. - page.objs = this.objs; - page.pdf = this; - return (this.pageCache[n] = page); - }, - - extractTextPage: function PDFDoc_extractTextPage(pageNum) { - this.messageHandler.send('extract_text', pageNum); - }, - - extractText: function PDFDoc_extractText() { - if (this.startedTextExtraction) - return; - - this.startedTextExtraction = true; - - this.workerReadyPromise.then(function pdfDocStartRenderingThen() { - // Start the text extraction process. 
- this.extractTextPage(1); - }.bind(this)); - }, - - destroy: function PDFDoc_destroy() { - if (this.worker) - this.worker.terminate(); - - if (this.fontWorker) - this.fontWorker.terminate(); - - for (var n in this.pageCache) - delete this.pageCache[n]; - - delete this.data; - delete this.stream; - delete this.pdf; - delete this.catalog; - } - }; - - return PDFDoc; -})(); - -globalScope.PDFJS.PDFDoc = PDFDoc; - diff --cc src/evaluator.js index e073942,fd1c6d6..6cefa57 --- a/src/evaluator.js +++ b/src/evaluator.js @@@ -153,8 -153,7 +153,9 @@@ var PartialEvaluator = (function Partia font = xref.fetchIfRef(font) || fontRes.get(fontName); assertWellFormed(isDict(font)); - ++self.objIdCounter; - if (!font.translated) { ++ + if (!font.loadedName) { ++ ++self.objIdCounter; font.translated = self.translateFont(font, xref, resources, dependency); if (font.translated) { diff --cc src/worker.js index 25f3f52,b7679bd..b3ba767 --- a/src/worker.js +++ b/src/worker.js @@@ -85,45 -85,15 +85,44 @@@ var WorkerMessageHandler = handler.send('test', data instanceof Uint8Array); }); - handler.on('doc', function wphSetupDoc(data) { + handler.on('GetDocRequest', function wphSetupDoc(data) { // Create only the model of the PDFDoc, which is enough for // processing the content of the pdf. 
- pdfModel = new PDFDocModel(new Stream(data)); + pdfModel = new PDFDocument(new Stream(data)); + var doc = { + numPages: pdfModel.numPages, + fingerprint: pdfModel.getFingerprint(), + destinations: pdfModel.catalog.destinations, + outline: pdfModel.catalog.documentOutline, + info: pdfModel.getDocumentInfo(), + metadata: pdfModel.catalog.metadata + }; + handler.send('GetDoc', {pdfInfo: doc}); }); - handler.on('page_request', function wphSetupPageRequest(pageNum) { - pageNum = parseInt(pageNum); + handler.on('GetPageRequest', function wphSetupGetPage(data) { + var pageNumber = data.pageIndex + 1; + var pdfPage = pdfModel.getPage(pageNumber); + var page = { + pageIndex: data.pageIndex, + rotate: pdfPage.rotate, + ref: pdfPage.ref, + view: pdfPage.view + }; + handler.send('GetPage', {pageInfo: page}); + }); + + handler.on('GetAnnotationsRequest', function wphSetupGetAnnotations(data) { + var pdfPage = pdfModel.getPage(data.pageIndex + 1); + handler.send('GetAnnotations', { + pageIndex: data.pageIndex, + annotations: pdfPage.getAnnotations() + }); + }); + + handler.on('RenderPageRequest', function wphSetupRenderPage(data) { + var pageNum = data.pageIndex + 1; - // The following code does quite the same as // Page.prototype.startRendering, but stops at one point and sends the // result back to the main thread. 
@@@ -183,6 -154,22 +182,24 @@@ depFonts: Object.keys(fonts) }); }, this); + - handler.on('extract_text', function wphExtractText(pageNum) { ++ handler.on('GetTextContent', function wphExtractText(data, promise) { ++ var pageNum = data.pageIndex + 1; + var start = Date.now(); + + var textContent = ''; + try { + var page = pdfModel.getPage(pageNum); + textContent = page.extractTextContent(); ++ promise.resolve(textContent); + } catch (e) { + // Skip errored pages ++ promise.reject(e); + } + + console.log('text indexing: page=%d - time=%dms', + pageNum, Date.now() - start); - handler.send('text_extracted', [pageNum, textContent]); + }); } }; diff --cc web/viewer.js index 340cf20,ec18ea5..054c08f --- a/web/viewer.js +++ b/web/viewer.js @@@ -214,6 -214,6 +214,8 @@@ var PDFView = currentScale: kUnknownScale, currentScaleValue: null, initialBookmark: document.location.hash.substring(1), ++ startedTextExtraction: false, ++ pageText: [], setScale: function pdfViewSetScale(val, resetAutoSettings) { if (val == this.currentScale) @@@ -590,8 -558,95 +592,69 @@@ // Setting the default one. this.parseScale(kDefaultScale, true); } - - this.metadata = null; - var metadata = pdf.catalog.metadata; - var info = this.documentInfo = pdf.info; - var pdfTitle; - - if (metadata) { - this.metadata = metadata = new PDFJS.Metadata(metadata); - - if (metadata.has('dc:title')) - pdfTitle = metadata.get('dc:title'); - } - - if (!pdfTitle && info && info['Title']) - pdfTitle = info['Title']; - - if (pdfTitle) - document.title = pdfTitle + ' - ' + document.title; - - pdf.textExtracted = (function pdfTextExtracted(pageIdx, content) { - this.search(); - }).bind(this); - }, - - startTextExtraction: function pdfViewStartTextExtraction(pdf) { - var searchResults = document.getElementById('searchResults'); - searchResults.textContent = ''; - - this.pdfDoc.extractText(); }, + search: function pdfViewStartSearch() { + // Limit this function to run at most once every SEARCH_TIMEOUT ms. 
+ var SEARCH_TIMEOUT = 250; + var lastSeach = this.lastSearch; + var now = Date.now(); + if (lastSeach && (now - lastSeach) < SEARCH_TIMEOUT) { - if (!this.searchTimer) - this.searchTimer = - setTimeout(this.search, SEARCH_TIMEOUT - (now - lastSeach)); - ++ if (!this.searchTimer) { ++ this.searchTimer = setTimeout(function resumeSearch() { ++ PDFView.search(); ++ }, ++ SEARCH_TIMEOUT - (now - lastSeach) ++ ); ++ } + return; + } + this.searchTimer = null; + this.lastSearch = now; + + function bindLink(link, pageNumber) { + link.href = '#' + pageNumber; + link.onclick = function searchBindLink() { + PDFView.page = pageNumber; + return false; + }; + } + + var searchResults = document.getElementById('searchResults'); + + var searchTermsInput = document.getElementById('searchTermsInput'); + searchResults.removeAttribute('hidden'); + searchResults.textContent = ''; + + var terms = searchTermsInput.value; + + if (!terms) + return; + + // simple search: removing spaces and hyphens, then scanning every + terms = terms.replace(/\s-/g, '').toLowerCase(); - var index = PDFView.pdfDoc.pageText; ++ var index = PDFView.pageText; + var pageFound = false; + for (var i = 0, ii = index.length; i < ii; i++) { + var pageText = index[i].replace(/\s-/g, '').toLowerCase(); + var j = pageText.indexOf(terms); + if (j < 0) + continue; + + var pageNumber = i + 1; + var textSample = index[i].substr(j, 50); + var link = document.createElement('a'); + bindLink(link, pageNumber); + link.textContent = 'Page ' + pageNumber + ': ' + textSample; + searchResults.appendChild(link); + + pageFound = true; + } + if (!pageFound) { + searchResults.textContent = '(Not found)'; + } + }, + setHash: function pdfViewSetHash(hash) { if (!hash) return; @@@ -633,26 -688,42 +696,60 @@@ switchSidebarView: function pdfViewSwitchSidebarView(view) { var thumbsScrollView = document.getElementById('sidebarScrollView'); - var outlineScrollView = document.getElementById('outlineScrollView'); var thumbsSwitchButton = 
document.getElementById('thumbsSwitch'); + if (view == 'thumbs') { + thumbsScrollView.removeAttribute('hidden'); + thumbsSwitchButton.setAttribute('data-selected', true); + } else { + thumbsScrollView.setAttribute('hidden', 'true'); + thumbsSwitchButton.removeAttribute('data-selected'); + } + + var outlineScrollView = document.getElementById('outlineScrollView'); var outlineSwitchButton = document.getElementById('outlineSwitch'); - switch (view) { - case 'thumbs': - thumbsScrollView.removeAttribute('hidden'); - outlineScrollView.setAttribute('hidden', 'true'); - thumbsSwitchButton.setAttribute('data-selected', true); - outlineSwitchButton.removeAttribute('data-selected'); - updateThumbViewArea(); - break; - case 'outline': - thumbsScrollView.setAttribute('hidden', 'true'); - outlineScrollView.removeAttribute('hidden'); - thumbsSwitchButton.removeAttribute('data-selected'); - outlineSwitchButton.setAttribute('data-selected', true); - break; + if (view == 'outline') { + outlineScrollView.removeAttribute('hidden'); + outlineSwitchButton.setAttribute('data-selected', true); + } else { + outlineScrollView.setAttribute('hidden', 'true'); + outlineSwitchButton.removeAttribute('data-selected'); + } + + var searchScrollView = document.getElementById('searchScrollView'); + var searchSwitchButton = document.getElementById('searchSwitch'); + if (view == 'search') { + searchScrollView.removeAttribute('hidden'); + searchSwitchButton.setAttribute('data-selected', true); + + var searchTermsInput = document.getElementById('searchTermsInput'); + searchTermsInput.focus(); + + // Start text extraction as soon as the search gets displayed. 
- this.pdfDoc.extractText(); ++ this.extractText(); + } else { + searchScrollView.setAttribute('hidden', 'true'); + searchSwitchButton.removeAttribute('data-selected'); } }, ++ extractText: function() { ++ if (this.startedTextExtraction) ++ return; ++ this.startedTextExtraction = true; ++ var self = this; ++ function extractPageText(pageIndex) { ++ self.pages[pageIndex].pdfPage.getTextContent().then( ++ function textContentResolved(textContent) { ++ self.pageText[pageIndex] = textContent; ++ self.search(); ++ if ((pageIndex + 1) < self.pages.length) ++ extractPageText(pageIndex + 1); ++ } ++ ); ++ }; ++ extractPageText(0); ++ }, ++ pinSidebar: function pdfViewPinSidebar() { document.getElementById('sidebar').classList.toggle('pinned'); },