Source code

001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.io.BufferedReader;
023import java.io.File;
024import java.io.IOException;
025import java.io.InputStream;
026import java.io.InputStreamReader;
027import java.text.SimpleDateFormat;
028import java.util.Calendar;
029import java.util.HashMap;
030import java.util.LinkedHashMap;
031import java.util.List;
032import java.util.Map;
033import org.apache.pdfbox.exceptions.CryptographyException;
034import org.apache.pdfbox.pdmodel.PDDocument;
035import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
036import org.apache.pdfbox.pdmodel.PDDocumentInformation;
037import org.apache.pdfbox.pdmodel.PDPage;
038import org.apache.pdfbox.pdmodel.common.PDMetadata;
039import org.apache.pdfbox.pdmodel.common.PDRectangle;
040import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
041import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
042import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
043import org.nuxeo.ecm.core.api.Blob;
044import org.nuxeo.ecm.core.api.NuxeoException;
045import org.nuxeo.ecm.core.api.CoreSession;
046import org.nuxeo.ecm.core.api.DocumentModel;
047
048/**
049 * The class will parse the info embedded in a PDF, and return them either globally (<code>toHashMap()</code> or
050 * <code>toString()</code>) or via individual getters.
051 * <p>
052 * The PDF is parsed only at first call to <code>run()</code>. Values are cached during first call.
053 * <p>
054 * About page sizes, see <a href="http://www.prepressure.com/pdf/basics/page-boxes">PDF page boxes</a> for details.
055 * Here, we get the info from the first page only. The dimensions are in points. Divide by 72 to get it in inches.
056 *
057 * @since 8.10
058 */
059public class PDFInfo {
060
061    private Blob pdfBlob;
062
063    private int numberOfPages = -1;
064
065    private float mediaBoxWidthInPoints = 0.0f;
066
067    private float mediaBoxHeightInPoints = 0.0f;
068
069    private float cropBoxWidthInPoints = 0.0f;
070
071    private float cropBoxHeightInPoints = 0.0f;
072
073    private long fileSize = -1;
074
075    private boolean isEncrypted;
076
077    private boolean doXMP = false;
078
079    private boolean alreadyParsed = false;
080
081    private String password;
082
083    private String author = "";
084
085    private String contentCreator = "";
086
087    private String fileName = "";
088
089    private String keywords = "";
090
091    private String pageLayout = "";
092
093    private String pdfVersion = "";
094
095    private String producer = "";
096
097    private String subject = "";
098
099    private String title;
100
101    private String xmp;
102
103    private Calendar creationDate;
104
105    private Calendar modificationDate;
106
107    private AccessPermission permissions;
108
109    private LinkedHashMap<String, String> cachedMap;
110
111    /**
112     * Constructor with a Blob.
113     *
114     * @param inBlob Input blob.
115     */
116    public PDFInfo(Blob inBlob) {
117        this(inBlob, null);
118    }
119
120    /**
121     * Constructor for Blob + encrypted PDF.
122     *
123     * @param inBlob Input blob.
124     * @param inPassword If the PDF is encrypted.
125     */
126    public PDFInfo(Blob inBlob, String inPassword) {
127        pdfBlob = inBlob;
128        password = inPassword;
129        title = "";
130    }
131
132    /**
133     * Constructor with a DocumentModel. Uses the default <code>file:content</code> xpath to get the blob from the
134     * document.
135     *
136     * @param inDoc Input DocumentModel.
137     */
138    public PDFInfo(DocumentModel inDoc) {
139        this(inDoc, null, null);
140    }
141
142    /**
143     * Constructor for DocumentModel + encrypted PDF
144     * <p>
145     * If <inXPath</code> is <code>null</code> or "", it is set to the default
146     * <code>file:content</code> value.
147     *
148     * @param inDoc Input DocumentModel.
149     * @param inXPath Input XPath.
150     * @param inPassword If the PDF is encrypted.
151     */
152    public PDFInfo(DocumentModel inDoc, String inXPath, String inPassword) {
153        if (inXPath == null || inXPath.isEmpty()) {
154            inXPath = "file:content";
155        }
156        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
157        password = inPassword;
158        title = "";
159    }
160
161    /**
162     * If set to true, parsing will extract PDF.
163     * <p>
164     * The value cannot be modified if <code>run()</code> already has been called.
165     *
166     * @param inValue true to extract XMP.
167     */
168    public void setParseWithXMP(boolean inValue) {
169        if (alreadyParsed && doXMP != inValue) {
170            throw new NuxeoException("Value of 'doXML' cannot be modified after the blob has been already parsed.");
171        }
172        doXMP = inValue;
173    }
174
175    private String checkNotNull(String inValue) {
176        return inValue == null ? "" : inValue;
177    }
178
179    /**
180     * After building the object with the correct constructor, and after possibly having set some parsing property
181     * (<code>setParseWithXMP()</code>, for example), this method will extract the information from the PDF.
182     * <p>
183     * After extraction, the info is available through getters: Either all of them (<code>toHashMap()</code> or
184     * <code>toString()</code>) or individual info (see all getters).
185     *
186     * @throws NuxeoException
187     */
188    public void run() throws NuxeoException {
189        // In case the caller calls several time the run() method
190        if (alreadyParsed) {
191            return;
192        }
193        fileName = pdfBlob.getFilename();
194        File pdfFile = pdfBlob.getFile();
195        fileSize = (pdfFile == null) ? -1 : pdfFile.length();
196        try (PDDocument pdfDoc = PDDocument.load(pdfBlob.getStream())) {
197            isEncrypted = pdfDoc.isEncrypted();
198            if (isEncrypted) {
199                pdfDoc.openProtection(new StandardDecryptionMaterial(password));
200            }
201            numberOfPages = pdfDoc.getNumberOfPages();
202            PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog();
203            pageLayout = checkNotNull(docCatalog.getPageLayout());
204            pdfVersion = String.valueOf(pdfDoc.getDocument().getVersion());
205            PDDocumentInformation docInfo = pdfDoc.getDocumentInformation();
206            author = checkNotNull(docInfo.getAuthor());
207            contentCreator = checkNotNull(docInfo.getCreator());
208            keywords = checkNotNull(docInfo.getKeywords());
209            try {
210                creationDate = docInfo.getCreationDate();
211            } catch (IOException e) {
212                creationDate = null;
213            }
214            try {
215                modificationDate = docInfo.getModificationDate();
216            } catch (IOException e) {
217                modificationDate = null;
218            }
219            producer = checkNotNull(docInfo.getProducer());
220            subject = checkNotNull(docInfo.getSubject());
221            title = checkNotNull(docInfo.getTitle());
222            permissions = pdfDoc.getCurrentAccessPermission();
223            // Getting dimension is a bit tricky
224            mediaBoxWidthInPoints = mediaBoxHeightInPoints = cropBoxWidthInPoints = cropBoxHeightInPoints = -1;
225            List allPages = docCatalog.getAllPages();
226            boolean gotMediaBox = false, gotCropBox = false;
227            for (Object pageObject : allPages) {
228                PDPage page = (PDPage) pageObject;
229                if (page != null) {
230                    PDRectangle r = page.findMediaBox();
231                    if (r != null) {
232                        mediaBoxWidthInPoints = r.getWidth();
233                        mediaBoxHeightInPoints = r.getHeight();
234                        gotMediaBox = true;
235                    }
236                    r = page.findCropBox();
237                    if (r != null) {
238                        cropBoxWidthInPoints = r.getWidth();
239                        cropBoxHeightInPoints = r.getHeight();
240                        gotCropBox = true;
241                    }
242                }
243                if (gotMediaBox && gotCropBox) {
244                    break;
245                }
246            }
247            if (doXMP) {
248                xmp = null;
249                PDMetadata metadata = docCatalog.getMetadata();
250                if (metadata != null) {
251                    xmp = "";
252                    InputStream xmlInputStream = metadata.createInputStream();
253                    InputStreamReader isr = new InputStreamReader(xmlInputStream);
254                    BufferedReader reader = new BufferedReader(isr);
255                    String line;
256                    do {
257                        line = reader.readLine();
258                        if (line != null) {
259                            xmp += line + "\n";
260                        }
261                    } while (line != null);
262                    reader.close();
263                }
264            }
265            alreadyParsed = true;
266        } catch (IOException | BadSecurityHandlerException | CryptographyException e) {
267            throw new NuxeoException(e);
268        }
269    }
270
271    /**
272     * Return all and every parsed info in a String <code>HashMap</code>.
273     * <p>
274     * Possible values are:
275     * <ul>
276     * <li>File name</li>
277     * <li>File size</li>
278     * <li>PDF version</li>
279     * <li>Page count</li>
280     * <li>Page size</li>
281     * <li>Page width</li>
282     * <li>Page height</li>
283     * <li>Page layout</li>
284     * <li>Title</li>
285     * <li>Author</li>
286     * <li>Subject</li>
287     * <li>PDF producer</li>
288     * <li>Content creator</li>
289     * <li>Creation date</li>
290     */
291    public HashMap<String, String> toHashMap() {
292        // Parse if needed
293        run();
294        if (cachedMap == null) {
295            cachedMap = new LinkedHashMap<>();
296            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
297            cachedMap.put("File name", fileName);
298            cachedMap.put("File size", String.valueOf(fileSize));
299            cachedMap.put("PDF version", pdfVersion);
300            cachedMap.put("Page count", String.valueOf(numberOfPages));
301            cachedMap.put("Page size", String.format("%.1f x %.1f points", mediaBoxWidthInPoints, mediaBoxHeightInPoints));
302            cachedMap.put("Page width", String.valueOf(mediaBoxWidthInPoints));
303            cachedMap.put("Page height", String.valueOf(mediaBoxHeightInPoints));
304            cachedMap.put("Page layout", pageLayout);
305            cachedMap.put("Title", title);
306            cachedMap.put("Author", author);
307            cachedMap.put("Subject", subject);
308            cachedMap.put("PDF producer", producer);
309            cachedMap.put("Content creator", contentCreator);
310            if (creationDate != null) {
311                cachedMap.put("Creation date", dateFormat.format(creationDate.getTime()));
312            } else {
313                cachedMap.put("Creation date", "");
314            }
315            if (modificationDate != null) {
316                cachedMap.put("Modification date", dateFormat.format(modificationDate.getTime()));
317            } else {
318                cachedMap.put("Modification date", "");
319            }
320            // "Others"
321            cachedMap.put("Encrypted", String.valueOf(isEncrypted));
322            cachedMap.put("Keywords", keywords);
323            cachedMap.put("Media box width", String.valueOf(mediaBoxWidthInPoints));
324            cachedMap.put("Media box height", String.valueOf(mediaBoxHeightInPoints));
325            cachedMap.put("Crop box width", String.valueOf(cropBoxWidthInPoints));
326            cachedMap.put("Crop box height", String.valueOf(cropBoxHeightInPoints));
327            if(permissions != null) {
328                cachedMap.put("Can Print", String.valueOf(permissions.canPrint()));
329                cachedMap.put("Can Modify", String.valueOf(permissions.canModify()));
330                cachedMap.put("Can Extract", String.valueOf(permissions.canExtractContent()));
331                cachedMap.put("Can Modify Annotations", String.valueOf(permissions.canModifyAnnotations()));
332                cachedMap.put("Can Fill Forms", String.valueOf(permissions.canFillInForm()));
333                cachedMap.put("Can Extract for Accessibility", String.valueOf(
334                    permissions.canExtractForAccessibility()));
335                cachedMap.put("Can Assemble", String.valueOf(permissions.canAssembleDocument()));
336                cachedMap.put("Can Print Degraded", String.valueOf(permissions.canPrintDegraded()));
337            }
338        }
339        return cachedMap;
340    }
341
342    /**
343     * The <code>inMapping</code> map is an HashMap where the key is the xpath of the destination field, and the value
344     * is the exact label of a PDF info as returned by <code>toHashMap()</code>. For example:
345     * <p>
346     * <code><pre>
347     * pdfinfo:title=Title
348     * pdfinfo:producer=PDF Producer
349     * pdfinfo:mediabox_width=Media box width
350     * ...
351     * </pre></code>
352     * <p>
353     * If <code>inSave</code> is false, inSession can be null.
354     *
355     * @param inDoc Input DocumentModel.
356     * @param inMapping Input Mapping.
357     * @param inSave Whether should save.
358     * @param inSession If is saving, should do it in this particular session.
359     */
360    public DocumentModel toFields(DocumentModel inDoc, HashMap<String, String> inMapping, boolean inSave,
361                                  CoreSession inSession) {
362        // Parse if needed
363        run();
364        Map<String, String> values = toHashMap();
365        for (String inXPath : inMapping.keySet()) {
366            String value = values.get(inMapping.get(inXPath));
367            inDoc.setPropertyValue(inXPath, value);
368        }
369        if (inSave) {
370            inDoc = inSession.saveDocument(inDoc);
371        }
372        return inDoc;
373    }
374
375    /**
376     * Wrapper for <code>toHashMap().toString()</code>
377     */
378    @Override
379    public String toString() {
380        return toHashMap().toString();
381    }
382
383    public int getNumberOfPages() {
384        return numberOfPages;
385    }
386
387    public float getMediaBoxWidthInPoints() {
388        return mediaBoxWidthInPoints;
389    }
390
391    public float getMediaBoxHeightInPoints() {
392        return mediaBoxHeightInPoints;
393    }
394
395    public float getCropBoxWidthInPoints() {
396        return cropBoxWidthInPoints;
397    }
398
399    public float getCropBoxHeightInPoints() {
400        return cropBoxHeightInPoints;
401    }
402
403    public long getFileSize() {
404        return fileSize;
405    }
406
407    public boolean isEncrypted() {
408        return isEncrypted;
409    }
410
411    public String getAuthor() {
412        return author;
413    }
414
415    public String getContentCreator() {
416        return contentCreator;
417    }
418
419    public String getFileName() {
420        return fileName;
421    }
422
423    public String getKeywords() {
424        return keywords;
425    }
426
427    public String getPageLayout() {
428        return pageLayout;
429    }
430
431    public String getPdfVersion() {
432        return pdfVersion;
433    }
434
435    public String getProducer() {
436        return producer;
437    }
438
439    public String getSubject() {
440        return subject;
441    }
442
443    public String getTitle() {
444        return title;
445    }
446
447    public String getXmp() {
448        return xmp;
449    }
450
451    public Calendar getCreationDate() {
452        return creationDate;
453    }
454
455    public Calendar getModificationDate() {
456        return modificationDate;
457    }
458
459    public AccessPermission getPermissions() {
460        return permissions;
461    }
462
463}