001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.NuxeoException;
048
049/**
050 * The class will parse the info embedded in a PDF, and return them either globally (<code>toHashMap()</code> or
051 * <code>toString()</code>) or via individual getters.
052 * <p>
053 * The PDF is parsed only at first call to <code>run()</code>. Values are cached during first call.
054 * <p>
055 * About page sizes, see <a href="http://www.prepressure.com/pdf/basics/page-boxes">PDF page boxes</a> for details.
056 * Here, we get the info from the first page only. The dimensions are in points. Divide by 72 to get it in inches.
057 *
058 * @since 8.10
059 */
060public class PDFInfo {
061
062    private Blob pdfBlob;
063
064    private int numberOfPages = -1;
065
066    private float mediaBoxWidthInPoints = 0.0f;
067
068    private float mediaBoxHeightInPoints = 0.0f;
069
070    private float cropBoxWidthInPoints = 0.0f;
071
072    private float cropBoxHeightInPoints = 0.0f;
073
074    private long fileSize = -1;
075
076    private boolean isEncrypted;
077
078    private boolean doXMP = false;
079
080    private boolean alreadyParsed = false;
081
082    private String password;
083
084    private String author = "";
085
086    private String contentCreator = "";
087
088    private String fileName = "";
089
090    private String keywords = "";
091
092    private String pageLayout = "";
093
094    private String pdfVersion = "";
095
096    private String producer = "";
097
098    private String subject = "";
099
100    private String title;
101
102    private String xmp;
103
104    private Calendar creationDate;
105
106    private Calendar modificationDate;
107
108    private AccessPermission permissions;
109
110    private LinkedHashMap<String, String> cachedMap;
111
112    /**
113     * Constructor with a Blob.
114     *
115     * @param inBlob Input blob.
116     */
117    public PDFInfo(Blob inBlob) {
118        this(inBlob, null);
119    }
120
121    /**
122     * Constructor for Blob + encrypted PDF.
123     *
124     * @param inBlob Input blob.
125     * @param inPassword If the PDF is encrypted.
126     */
127    public PDFInfo(Blob inBlob, String inPassword) {
128        pdfBlob = inBlob;
129        password = inPassword;
130        title = "";
131    }
132
133    /**
134     * Constructor with a DocumentModel. Uses the default <code>file:content</code> xpath to get the blob from the
135     * document.
136     *
137     * @param inDoc Input DocumentModel.
138     */
139    public PDFInfo(DocumentModel inDoc) {
140        this(inDoc, null, null);
141    }
142
143    /**
144     * Constructor for DocumentModel + encrypted PDF
145     * <p>
146     * If <inXPath</code> is <code>null</code> or "", it is set to the default
147     * <code>file:content</code> value.
148     *
149     * @param inDoc Input DocumentModel.
150     * @param inXPath Input XPath.
151     * @param inPassword If the PDF is encrypted.
152     */
153    public PDFInfo(DocumentModel inDoc, String inXPath, String inPassword) {
154        if (inXPath == null || inXPath.isEmpty()) {
155            inXPath = "file:content";
156        }
157        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
158        password = inPassword;
159        title = "";
160    }
161
162    /**
163     * If set to true, parsing will extract PDF.
164     * <p>
165     * The value cannot be modified if <code>run()</code> already has been called.
166     *
167     * @param inValue true to extract XMP.
168     */
169    public void setParseWithXMP(boolean inValue) {
170        if (alreadyParsed && doXMP != inValue) {
171            throw new NuxeoException("Value of 'doXML' cannot be modified after the blob has been already parsed.");
172        }
173        doXMP = inValue;
174    }
175
176    private String checkNotNull(String inValue) {
177        return inValue == null ? "" : inValue;
178    }
179
180    /**
181     * After building the object with the correct constructor, and after possibly having set some parsing property
182     * (<code>setParseWithXMP()</code>, for example), this method will extract the information from the PDF.
183     * <p>
184     * After extraction, the info is available through getters: Either all of them (<code>toHashMap()</code> or
185     * <code>toString()</code>) or individual info (see all getters).
186     *
187     * @throws NuxeoException
188     */
189    public void run() throws NuxeoException {
190        // In case the caller calls several time the run() method
191        if (alreadyParsed) {
192            return;
193        }
194        fileName = pdfBlob.getFilename();
195        File pdfFile = pdfBlob.getFile();
196        fileSize = (pdfFile == null) ? -1 : pdfFile.length();
197        try (PDDocument pdfDoc = PDDocument.load(pdfBlob.getStream())) {
198            isEncrypted = pdfDoc.isEncrypted();
199            if (isEncrypted) {
200                pdfDoc.openProtection(new StandardDecryptionMaterial(password));
201            }
202            numberOfPages = pdfDoc.getNumberOfPages();
203            PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog();
204            pageLayout = checkNotNull(docCatalog.getPageLayout());
205            pdfVersion = String.valueOf(pdfDoc.getDocument().getVersion());
206            PDDocumentInformation docInfo = pdfDoc.getDocumentInformation();
207            author = checkNotNull(docInfo.getAuthor());
208            contentCreator = checkNotNull(docInfo.getCreator());
209            keywords = checkNotNull(docInfo.getKeywords());
210            try {
211                creationDate = docInfo.getCreationDate();
212            } catch (IOException e) {
213                creationDate = null;
214            }
215            try {
216                modificationDate = docInfo.getModificationDate();
217            } catch (IOException e) {
218                modificationDate = null;
219            }
220            producer = checkNotNull(docInfo.getProducer());
221            subject = checkNotNull(docInfo.getSubject());
222            title = checkNotNull(docInfo.getTitle());
223            permissions = pdfDoc.getCurrentAccessPermission();
224            // Getting dimension is a bit tricky
225            mediaBoxWidthInPoints = mediaBoxHeightInPoints = cropBoxWidthInPoints = cropBoxHeightInPoints = -1;
226            List allPages = docCatalog.getAllPages();
227            boolean gotMediaBox = false, gotCropBox = false;
228            for (Object pageObject : allPages) {
229                PDPage page = (PDPage) pageObject;
230                if (page != null) {
231                    PDRectangle r = page.findMediaBox();
232                    if (r != null) {
233                        mediaBoxWidthInPoints = r.getWidth();
234                        mediaBoxHeightInPoints = r.getHeight();
235                        gotMediaBox = true;
236                    }
237                    r = page.findCropBox();
238                    if (r != null) {
239                        cropBoxWidthInPoints = r.getWidth();
240                        cropBoxHeightInPoints = r.getHeight();
241                        gotCropBox = true;
242                    }
243                }
244                if (gotMediaBox && gotCropBox) {
245                    break;
246                }
247            }
248            if (doXMP) {
249                xmp = null;
250                PDMetadata metadata = docCatalog.getMetadata();
251                if (metadata != null) {
252                    xmp = "";
253                    try (InputStream xmlInputStream = metadata.createInputStream(); //
254                            BufferedReader reader = new BufferedReader(new InputStreamReader(xmlInputStream))) {
255                        String line;
256                        do {
257                            line = reader.readLine();
258                            if (line != null) {
259                                xmp += line + "\n";
260                            }
261                        } while (line != null);
262                    }
263                }
264            }
265            alreadyParsed = true;
266        } catch (IOException | BadSecurityHandlerException | CryptographyException e) {
267            throw new NuxeoException(e);
268        }
269    }
270
271    /**
272     * Return all and every parsed info in a String <code>HashMap</code>.
273     * <p>
274     * Possible values are:
275     * <ul>
276     * <li>File name</li>
277     * <li>File size</li>
278     * <li>PDF version</li>
279     * <li>Page count</li>
280     * <li>Page size</li>
281     * <li>Page width</li>
282     * <li>Page height</li>
283     * <li>Page layout</li>
284     * <li>Title</li>
285     * <li>Author</li>
286     * <li>Subject</li>
287     * <li>PDF producer</li>
288     * <li>Content creator</li>
289     * <li>Creation date</li>
290     */
291    public HashMap<String, String> toHashMap() {
292        // Parse if needed
293        run();
294        if (cachedMap == null) {
295            cachedMap = new LinkedHashMap<>();
296            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
297            cachedMap.put("File name", fileName);
298            cachedMap.put("File size", String.valueOf(fileSize));
299            cachedMap.put("PDF version", pdfVersion);
300            cachedMap.put("Page count", String.valueOf(numberOfPages));
301            cachedMap.put("Page size",
302                    String.format(Locale.ENGLISH, "%.1f x %.1f points", mediaBoxWidthInPoints, mediaBoxHeightInPoints));
303            cachedMap.put("Page width", String.valueOf(mediaBoxWidthInPoints));
304            cachedMap.put("Page height", String.valueOf(mediaBoxHeightInPoints));
305            cachedMap.put("Page layout", pageLayout);
306            cachedMap.put("Title", title);
307            cachedMap.put("Author", author);
308            cachedMap.put("Subject", subject);
309            cachedMap.put("PDF producer", producer);
310            cachedMap.put("Content creator", contentCreator);
311            if (creationDate != null) {
312                cachedMap.put("Creation date", dateFormat.format(creationDate.getTime()));
313            } else {
314                cachedMap.put("Creation date", "");
315            }
316            if (modificationDate != null) {
317                cachedMap.put("Modification date", dateFormat.format(modificationDate.getTime()));
318            } else {
319                cachedMap.put("Modification date", "");
320            }
321            // "Others"
322            cachedMap.put("Encrypted", String.valueOf(isEncrypted));
323            cachedMap.put("Keywords", keywords);
324            cachedMap.put("Media box width", String.valueOf(mediaBoxWidthInPoints));
325            cachedMap.put("Media box height", String.valueOf(mediaBoxHeightInPoints));
326            cachedMap.put("Crop box width", String.valueOf(cropBoxWidthInPoints));
327            cachedMap.put("Crop box height", String.valueOf(cropBoxHeightInPoints));
328            if(permissions != null) {
329                cachedMap.put("Can Print", String.valueOf(permissions.canPrint()));
330                cachedMap.put("Can Modify", String.valueOf(permissions.canModify()));
331                cachedMap.put("Can Extract", String.valueOf(permissions.canExtractContent()));
332                cachedMap.put("Can Modify Annotations", String.valueOf(permissions.canModifyAnnotations()));
333                cachedMap.put("Can Fill Forms", String.valueOf(permissions.canFillInForm()));
334                cachedMap.put("Can Extract for Accessibility", String.valueOf(
335                    permissions.canExtractForAccessibility()));
336                cachedMap.put("Can Assemble", String.valueOf(permissions.canAssembleDocument()));
337                cachedMap.put("Can Print Degraded", String.valueOf(permissions.canPrintDegraded()));
338            }
339        }
340        return cachedMap;
341    }
342
343    /**
344     * The <code>inMapping</code> map is an HashMap where the key is the xpath of the destination field, and the value
345     * is the exact label of a PDF info as returned by <code>toHashMap()</code>. For example:
346     * <p>
347     * <code><pre>
348     * pdfinfo:title=Title
349     * pdfinfo:producer=PDF Producer
350     * pdfinfo:mediabox_width=Media box width
351     * ...
352     * </pre></code>
353     * <p>
354     * If <code>inSave</code> is false, inSession can be null.
355     *
356     * @param inDoc Input DocumentModel.
357     * @param inMapping Input Mapping.
358     * @param inSave Whether should save.
359     * @param inSession If is saving, should do it in this particular session.
360     */
361    public DocumentModel toFields(DocumentModel inDoc, HashMap<String, String> inMapping, boolean inSave,
362                                  CoreSession inSession) {
363        // Parse if needed
364        run();
365        Map<String, String> values = toHashMap();
366        for (String inXPath : inMapping.keySet()) {
367            String value = values.get(inMapping.get(inXPath));
368            inDoc.setPropertyValue(inXPath, value);
369        }
370        if (inSave) {
371            inDoc = inSession.saveDocument(inDoc);
372        }
373        return inDoc;
374    }
375
376    /**
377     * Wrapper for <code>toHashMap().toString()</code>
378     */
379    @Override
380    public String toString() {
381        return toHashMap().toString();
382    }
383
384    public int getNumberOfPages() {
385        return numberOfPages;
386    }
387
388    public float getMediaBoxWidthInPoints() {
389        return mediaBoxWidthInPoints;
390    }
391
392    public float getMediaBoxHeightInPoints() {
393        return mediaBoxHeightInPoints;
394    }
395
396    public float getCropBoxWidthInPoints() {
397        return cropBoxWidthInPoints;
398    }
399
400    public float getCropBoxHeightInPoints() {
401        return cropBoxHeightInPoints;
402    }
403
404    public long getFileSize() {
405        return fileSize;
406    }
407
408    public boolean isEncrypted() {
409        return isEncrypted;
410    }
411
412    public String getAuthor() {
413        return author;
414    }
415
416    public String getContentCreator() {
417        return contentCreator;
418    }
419
420    public String getFileName() {
421        return fileName;
422    }
423
424    public String getKeywords() {
425        return keywords;
426    }
427
428    public String getPageLayout() {
429        return pageLayout;
430    }
431
432    public String getPdfVersion() {
433        return pdfVersion;
434    }
435
436    public String getProducer() {
437        return producer;
438    }
439
440    public String getSubject() {
441        return subject;
442    }
443
444    public String getTitle() {
445        return title;
446    }
447
448    public String getXmp() {
449        return xmp;
450    }
451
452    public Calendar getCreationDate() {
453        return creationDate;
454    }
455
456    public Calendar getModificationDate() {
457        return modificationDate;
458    }
459
460    public AccessPermission getPermissions() {
461        return permissions;
462    }
463
464}