Source code

001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.io.BufferedReader;
023import java.io.File;
024import java.io.IOException;
025import java.io.InputStream;
026import java.io.InputStreamReader;
027import java.text.SimpleDateFormat;
028import java.util.Calendar;
029import java.util.HashMap;
030import java.util.LinkedHashMap;
031import java.util.Locale;
032import java.util.Map;
033
034import org.apache.pdfbox.pdmodel.PDDocument;
035import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
036import org.apache.pdfbox.pdmodel.PDDocumentInformation;
037import org.apache.pdfbox.pdmodel.PDPage;
038import org.apache.pdfbox.pdmodel.common.PDMetadata;
039import org.apache.pdfbox.pdmodel.common.PDRectangle;
040import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
041import org.nuxeo.ecm.core.api.Blob;
042import org.nuxeo.ecm.core.api.CoreSession;
043import org.nuxeo.ecm.core.api.DocumentModel;
044import org.nuxeo.ecm.core.api.NuxeoException;
045
046/**
047 * The class will parse the info embedded in a PDF, and return them either globally (<code>toHashMap()</code> or
048 * <code>toString()</code>) or via individual getters.
049 * <p>
050 * The PDF is parsed only at first call to <code>run()</code>. Values are cached during first call.
051 * <p>
052 * About page sizes, see <a href="http://www.prepressure.com/pdf/basics/page-boxes">PDF page boxes</a> for details.
053 * Here, we get the info from the first page only. The dimensions are in points. Divide by 72 to get it in inches.
054 *
055 * @since 8.10
056 */
057public class PDFInfo {
058
059    private Blob pdfBlob;
060
061    private int numberOfPages = -1;
062
063    private float mediaBoxWidthInPoints = 0.0f;
064
065    private float mediaBoxHeightInPoints = 0.0f;
066
067    private float cropBoxWidthInPoints = 0.0f;
068
069    private float cropBoxHeightInPoints = 0.0f;
070
071    private long fileSize = -1;
072
073    private boolean isEncrypted;
074
075    private boolean doXMP = false;
076
077    private boolean alreadyParsed = false;
078
079    private String password;
080
081    private String author = "";
082
083    private String contentCreator = "";
084
085    private String fileName = "";
086
087    private String keywords = "";
088
089    private String pageLayout = "";
090
091    private String pdfVersion = "";
092
093    private String producer = "";
094
095    private String subject = "";
096
097    private String title;
098
099    private String xmp;
100
101    private Calendar creationDate;
102
103    private Calendar modificationDate;
104
105    private AccessPermission permissions;
106
107    private LinkedHashMap<String, String> cachedMap;
108
109    /**
110     * Constructor with a Blob.
111     *
112     * @param inBlob Input blob.
113     */
114    public PDFInfo(Blob inBlob) {
115        this(inBlob, null);
116    }
117
118    /**
119     * Constructor for Blob + encrypted PDF.
120     *
121     * @param inBlob Input blob.
122     * @param inPassword If the PDF is encrypted.
123     */
124    public PDFInfo(Blob inBlob, String inPassword) {
125        pdfBlob = inBlob;
126        password = inPassword;
127        title = "";
128    }
129
130    /**
131     * Constructor with a DocumentModel. Uses the default <code>file:content</code> xpath to get the blob from the
132     * document.
133     *
134     * @param inDoc Input DocumentModel.
135     */
136    public PDFInfo(DocumentModel inDoc) {
137        this(inDoc, null, null);
138    }
139
140    /**
141     * Constructor for DocumentModel + encrypted PDF
142     * <p>
143     * If {@code inXPath} is {@code null} or {@code ""}, it is set to the default {@code file:content} value.
144     *
145     * @param inDoc Input DocumentModel.
146     * @param inXPath Input XPath.
147     * @param inPassword If the PDF is encrypted.
148     */
149    public PDFInfo(DocumentModel inDoc, String inXPath, String inPassword) {
150        if (inXPath == null || inXPath.isEmpty()) {
151            inXPath = "file:content";
152        }
153        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
154        password = inPassword;
155        title = "";
156    }
157
158    /**
159     * If set to true, parsing will extract PDF.
160     * <p>
161     * The value cannot be modified if <code>run()</code> already has been called.
162     *
163     * @param inValue true to extract XMP.
164     */
165    public void setParseWithXMP(boolean inValue) {
166        if (alreadyParsed && doXMP != inValue) {
167            throw new NuxeoException("Value of 'doXML' cannot be modified after the blob has been already parsed.");
168        }
169        doXMP = inValue;
170    }
171
172    private String checkNotNull(String inValue) {
173        return inValue == null ? "" : inValue;
174    }
175
176    /**
177     * After building the object with the correct constructor, and after possibly having set some parsing property
178     * (<code>setParseWithXMP()</code>, for example), this method will extract the information from the PDF.
179     * <p>
180     * After extraction, the info is available through getters: Either all of them (<code>toHashMap()</code> or
181     * <code>toString()</code>) or individual info (see all getters).
182     */
183    public void run() throws NuxeoException {
184        // In case the caller calls several time the run() method
185        if (alreadyParsed) {
186            return;
187        }
188        fileName = pdfBlob.getFilename();
189        File pdfFile = pdfBlob.getFile();
190        fileSize = (pdfFile == null) ? -1 : pdfFile.length();
191        try (PDDocument pdfDoc = PDDocument.load(pdfBlob.getStream(), password)) {
192            isEncrypted = pdfDoc.isEncrypted();
193            numberOfPages = pdfDoc.getNumberOfPages();
194            PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog();
195            pageLayout = docCatalog.getPageLayout().stringValue();
196            pdfVersion = String.valueOf(pdfDoc.getDocument().getVersion());
197            PDDocumentInformation docInfo = pdfDoc.getDocumentInformation();
198            author = checkNotNull(docInfo.getAuthor());
199            contentCreator = checkNotNull(docInfo.getCreator());
200            keywords = checkNotNull(docInfo.getKeywords());
201            creationDate = docInfo.getCreationDate();
202            modificationDate = docInfo.getModificationDate();
203            producer = checkNotNull(docInfo.getProducer());
204            subject = checkNotNull(docInfo.getSubject());
205            title = checkNotNull(docInfo.getTitle());
206            permissions = pdfDoc.getCurrentAccessPermission();
207            // Getting dimension is a bit tricky
208            mediaBoxWidthInPoints = mediaBoxHeightInPoints = cropBoxWidthInPoints = cropBoxHeightInPoints = -1;
209            boolean gotMediaBox = false, gotCropBox = false;
210            for (PDPage page : docCatalog.getPages()) {
211                if (page != null) {
212                    PDRectangle r = page.getMediaBox();
213                    if (r != null) {
214                        mediaBoxWidthInPoints = r.getWidth();
215                        mediaBoxHeightInPoints = r.getHeight();
216                        gotMediaBox = true;
217                    }
218                    r = page.getCropBox();
219                    if (r != null) {
220                        cropBoxWidthInPoints = r.getWidth();
221                        cropBoxHeightInPoints = r.getHeight();
222                        gotCropBox = true;
223                    }
224                }
225                if (gotMediaBox && gotCropBox) {
226                    break;
227                }
228            }
229            if (doXMP) {
230                xmp = null;
231                PDMetadata metadata = docCatalog.getMetadata();
232                if (metadata != null) {
233                    xmp = "";
234                    try (InputStream xmlInputStream = metadata.createInputStream(); //
235                            BufferedReader reader = new BufferedReader(new InputStreamReader(xmlInputStream))) {
236                        String line;
237                        do {
238                            line = reader.readLine();
239                            if (line != null) {
240                                xmp += line + "\n";
241                            }
242                        } while (line != null);
243                    }
244                }
245            }
246            alreadyParsed = true;
247        } catch (IOException e) {
248            throw new NuxeoException(e);
249        }
250    }
251
252    /**
253     * Return all and every parsed info in a String <code>HashMap</code>.
254     * <p>
255     * Possible values are:
256     * <ul>
257     * <li>File name</li>
258     * <li>File size</li>
259     * <li>PDF version</li>
260     * <li>Page count</li>
261     * <li>Page size</li>
262     * <li>Page width</li>
263     * <li>Page height</li>
264     * <li>Page layout</li>
265     * <li>Title</li>
266     * <li>Author</li>
267     * <li>Subject</li>
268     * <li>PDF producer</li>
269     * <li>Content creator</li>
270     * <li>Creation date</li>
271     * </ul>
272     */
273    public HashMap<String, String> toHashMap() {
274        // Parse if needed
275        run();
276        if (cachedMap == null) {
277            cachedMap = new LinkedHashMap<>();
278            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
279            cachedMap.put("File name", fileName);
280            cachedMap.put("File size", String.valueOf(fileSize));
281            cachedMap.put("PDF version", pdfVersion);
282            cachedMap.put("Page count", String.valueOf(numberOfPages));
283            cachedMap.put("Page size",
284                    String.format(Locale.ENGLISH, "%.1f x %.1f points", mediaBoxWidthInPoints, mediaBoxHeightInPoints));
285            cachedMap.put("Page width", String.valueOf(mediaBoxWidthInPoints));
286            cachedMap.put("Page height", String.valueOf(mediaBoxHeightInPoints));
287            cachedMap.put("Page layout", pageLayout);
288            cachedMap.put("Title", title);
289            cachedMap.put("Author", author);
290            cachedMap.put("Subject", subject);
291            cachedMap.put("PDF producer", producer);
292            cachedMap.put("Content creator", contentCreator);
293            if (creationDate != null) {
294                cachedMap.put("Creation date", dateFormat.format(creationDate.getTime()));
295            } else {
296                cachedMap.put("Creation date", "");
297            }
298            if (modificationDate != null) {
299                cachedMap.put("Modification date", dateFormat.format(modificationDate.getTime()));
300            } else {
301                cachedMap.put("Modification date", "");
302            }
303            // "Others"
304            cachedMap.put("Encrypted", String.valueOf(isEncrypted));
305            cachedMap.put("Keywords", keywords);
306            cachedMap.put("Media box width", String.valueOf(mediaBoxWidthInPoints));
307            cachedMap.put("Media box height", String.valueOf(mediaBoxHeightInPoints));
308            cachedMap.put("Crop box width", String.valueOf(cropBoxWidthInPoints));
309            cachedMap.put("Crop box height", String.valueOf(cropBoxHeightInPoints));
310            if(permissions != null) {
311                cachedMap.put("Can Print", String.valueOf(permissions.canPrint()));
312                cachedMap.put("Can Modify", String.valueOf(permissions.canModify()));
313                cachedMap.put("Can Extract", String.valueOf(permissions.canExtractContent()));
314                cachedMap.put("Can Modify Annotations", String.valueOf(permissions.canModifyAnnotations()));
315                cachedMap.put("Can Fill Forms", String.valueOf(permissions.canFillInForm()));
316                cachedMap.put("Can Extract for Accessibility", String.valueOf(
317                    permissions.canExtractForAccessibility()));
318                cachedMap.put("Can Assemble", String.valueOf(permissions.canAssembleDocument()));
319                cachedMap.put("Can Print Degraded", String.valueOf(permissions.canPrintDegraded()));
320            }
321        }
322        return cachedMap;
323    }
324
325    /**
326     * The <code>inMapping</code> map is an HashMap where the key is the xpath of the destination field, and the value
327     * is the exact label of a PDF info as returned by <code>toHashMap()</code>. For example:
328     * <p>
329     * <pre><code>
330     * pdfinfo:title=Title
331     * pdfinfo:producer=PDF Producer
332     * pdfinfo:mediabox_width=Media box width
333     * ...
334     * </code></pre>
335     * <p>
336     * If <code>inSave</code> is false, inSession can be null.
337     *
338     * @param inDoc Input DocumentModel.
339     * @param inMapping Input Mapping.
340     * @param inSave Whether should save.
341     * @param inSession If is saving, should do it in this particular session.
342     */
343    public DocumentModel toFields(DocumentModel inDoc, HashMap<String, String> inMapping, boolean inSave,
344                                  CoreSession inSession) {
345        // Parse if needed
346        run();
347        Map<String, String> values = toHashMap();
348        for (String inXPath : inMapping.keySet()) {
349            String value = values.get(inMapping.get(inXPath));
350            inDoc.setPropertyValue(inXPath, value);
351        }
352        if (inSave) {
353            inDoc = inSession.saveDocument(inDoc);
354        }
355        return inDoc;
356    }
357
358    /**
359     * Wrapper for <code>toHashMap().toString()</code>
360     */
361    @Override
362    public String toString() {
363        return toHashMap().toString();
364    }
365
366    public int getNumberOfPages() {
367        return numberOfPages;
368    }
369
370    public float getMediaBoxWidthInPoints() {
371        return mediaBoxWidthInPoints;
372    }
373
374    public float getMediaBoxHeightInPoints() {
375        return mediaBoxHeightInPoints;
376    }
377
378    public float getCropBoxWidthInPoints() {
379        return cropBoxWidthInPoints;
380    }
381
382    public float getCropBoxHeightInPoints() {
383        return cropBoxHeightInPoints;
384    }
385
386    public long getFileSize() {
387        return fileSize;
388    }
389
390    public boolean isEncrypted() {
391        return isEncrypted;
392    }
393
394    public String getAuthor() {
395        return author;
396    }
397
398    public String getContentCreator() {
399        return contentCreator;
400    }
401
402    public String getFileName() {
403        return fileName;
404    }
405
406    public String getKeywords() {
407        return keywords;
408    }
409
410    public String getPageLayout() {
411        return pageLayout;
412    }
413
414    public String getPdfVersion() {
415        return pdfVersion;
416    }
417
418    public String getProducer() {
419        return producer;
420    }
421
422    public String getSubject() {
423        return subject;
424    }
425
426    public String getTitle() {
427        return title;
428    }
429
430    public String getXmp() {
431        return xmp;
432    }
433
434    public Calendar getCreationDate() {
435        return creationDate;
436    }
437
438    public Calendar getModificationDate() {
439        return modificationDate;
440    }
441
442    public AccessPermission getPermissions() {
443        return permissions;
444    }
445
446}