Source code

001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.io.BufferedReader;
023import java.io.File;
024import java.io.IOException;
025import java.io.InputStream;
026import java.io.InputStreamReader;
027import java.text.SimpleDateFormat;
028import java.util.Calendar;
029import java.util.HashMap;
030import java.util.LinkedHashMap;
031import java.util.List;
032import java.util.Locale;
033import java.util.Map;
034import org.apache.pdfbox.exceptions.CryptographyException;
035import org.apache.pdfbox.pdmodel.PDDocument;
036import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
037import org.apache.pdfbox.pdmodel.PDDocumentInformation;
038import org.apache.pdfbox.pdmodel.PDPage;
039import org.apache.pdfbox.pdmodel.common.PDMetadata;
040import org.apache.pdfbox.pdmodel.common.PDRectangle;
041import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
042import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
043import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
044import org.nuxeo.ecm.core.api.Blob;
045import org.nuxeo.ecm.core.api.NuxeoException;
046import org.nuxeo.ecm.core.api.CoreSession;
047import org.nuxeo.ecm.core.api.DocumentModel;
048
049/**
050 * The class will parse the info embedded in a PDF, and return them either globally (<code>toHashMap()</code> or
051 * <code>toString()</code>) or via individual getters.
052 * <p>
053 * The PDF is parsed only at first call to <code>run()</code>. Values are cached during first call.
054 * <p>
055 * About page sizes, see <a href="http://www.prepressure.com/pdf/basics/page-boxes">PDF page boxes</a> for details.
056 * Here, we get the info from the first page only. The dimensions are in points. Divide by 72 to get it in inches.
057 *
058 * @since 8.10
059 */
060public class PDFInfo {
061
062    private Blob pdfBlob;
063
064    private int numberOfPages = -1;
065
066    private float mediaBoxWidthInPoints = 0.0f;
067
068    private float mediaBoxHeightInPoints = 0.0f;
069
070    private float cropBoxWidthInPoints = 0.0f;
071
072    private float cropBoxHeightInPoints = 0.0f;
073
074    private long fileSize = -1;
075
076    private boolean isEncrypted;
077
078    private boolean doXMP = false;
079
080    private boolean alreadyParsed = false;
081
082    private String password;
083
084    private String author = "";
085
086    private String contentCreator = "";
087
088    private String fileName = "";
089
090    private String keywords = "";
091
092    private String pageLayout = "";
093
094    private String pdfVersion = "";
095
096    private String producer = "";
097
098    private String subject = "";
099
100    private String title;
101
102    private String xmp;
103
104    private Calendar creationDate;
105
106    private Calendar modificationDate;
107
108    private AccessPermission permissions;
109
110    private LinkedHashMap<String, String> cachedMap;
111
112    /**
113     * Constructor with a Blob.
114     *
115     * @param inBlob Input blob.
116     */
117    public PDFInfo(Blob inBlob) {
118        this(inBlob, null);
119    }
120
121    /**
122     * Constructor for Blob + encrypted PDF.
123     *
124     * @param inBlob Input blob.
125     * @param inPassword If the PDF is encrypted.
126     */
127    public PDFInfo(Blob inBlob, String inPassword) {
128        pdfBlob = inBlob;
129        password = inPassword;
130        title = "";
131    }
132
133    /**
134     * Constructor with a DocumentModel. Uses the default <code>file:content</code> xpath to get the blob from the
135     * document.
136     *
137     * @param inDoc Input DocumentModel.
138     */
139    public PDFInfo(DocumentModel inDoc) {
140        this(inDoc, null, null);
141    }
142
143    /**
144     * Constructor for DocumentModel + encrypted PDF
145     * <p>
146     * If <inXPath</code> is <code>null</code> or "", it is set to the default
147     * <code>file:content</code> value.
148     *
149     * @param inDoc Input DocumentModel.
150     * @param inXPath Input XPath.
151     * @param inPassword If the PDF is encrypted.
152     */
153    public PDFInfo(DocumentModel inDoc, String inXPath, String inPassword) {
154        if (inXPath == null || inXPath.isEmpty()) {
155            inXPath = "file:content";
156        }
157        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
158        password = inPassword;
159        title = "";
160    }
161
162    /**
163     * If set to true, parsing will extract PDF.
164     * <p>
165     * The value cannot be modified if <code>run()</code> already has been called.
166     *
167     * @param inValue true to extract XMP.
168     */
169    public void setParseWithXMP(boolean inValue) {
170        if (alreadyParsed && doXMP != inValue) {
171            throw new NuxeoException("Value of 'doXML' cannot be modified after the blob has been already parsed.");
172        }
173        doXMP = inValue;
174    }
175
176    private String checkNotNull(String inValue) {
177        return inValue == null ? "" : inValue;
178    }
179
180    /**
181     * After building the object with the correct constructor, and after possibly having set some parsing property
182     * (<code>setParseWithXMP()</code>, for example), this method will extract the information from the PDF.
183     * <p>
184     * After extraction, the info is available through getters: Either all of them (<code>toHashMap()</code> or
185     * <code>toString()</code>) or individual info (see all getters).
186     *
187     * @throws NuxeoException
188     */
189    public void run() throws NuxeoException {
190        // In case the caller calls several time the run() method
191        if (alreadyParsed) {
192            return;
193        }
194        fileName = pdfBlob.getFilename();
195        File pdfFile = pdfBlob.getFile();
196        fileSize = (pdfFile == null) ? -1 : pdfFile.length();
197        try (PDDocument pdfDoc = PDDocument.load(pdfBlob.getStream())) {
198            isEncrypted = pdfDoc.isEncrypted();
199            if (isEncrypted) {
200                pdfDoc.openProtection(new StandardDecryptionMaterial(password));
201            }
202            numberOfPages = pdfDoc.getNumberOfPages();
203            PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog();
204            pageLayout = checkNotNull(docCatalog.getPageLayout());
205            pdfVersion = String.valueOf(pdfDoc.getDocument().getVersion());
206            PDDocumentInformation docInfo = pdfDoc.getDocumentInformation();
207            author = checkNotNull(docInfo.getAuthor());
208            contentCreator = checkNotNull(docInfo.getCreator());
209            keywords = checkNotNull(docInfo.getKeywords());
210            try {
211                creationDate = docInfo.getCreationDate();
212            } catch (IOException e) {
213                creationDate = null;
214            }
215            try {
216                modificationDate = docInfo.getModificationDate();
217            } catch (IOException e) {
218                modificationDate = null;
219            }
220            producer = checkNotNull(docInfo.getProducer());
221            subject = checkNotNull(docInfo.getSubject());
222            title = checkNotNull(docInfo.getTitle());
223            permissions = pdfDoc.getCurrentAccessPermission();
224            // Getting dimension is a bit tricky
225            mediaBoxWidthInPoints = mediaBoxHeightInPoints = cropBoxWidthInPoints = cropBoxHeightInPoints = -1;
226            List allPages = docCatalog.getAllPages();
227            boolean gotMediaBox = false, gotCropBox = false;
228            for (Object pageObject : allPages) {
229                PDPage page = (PDPage) pageObject;
230                if (page != null) {
231                    PDRectangle r = page.findMediaBox();
232                    if (r != null) {
233                        mediaBoxWidthInPoints = r.getWidth();
234                        mediaBoxHeightInPoints = r.getHeight();
235                        gotMediaBox = true;
236                    }
237                    r = page.findCropBox();
238                    if (r != null) {
239                        cropBoxWidthInPoints = r.getWidth();
240                        cropBoxHeightInPoints = r.getHeight();
241                        gotCropBox = true;
242                    }
243                }
244                if (gotMediaBox && gotCropBox) {
245                    break;
246                }
247            }
248            if (doXMP) {
249                xmp = null;
250                PDMetadata metadata = docCatalog.getMetadata();
251                if (metadata != null) {
252                    xmp = "";
253                    InputStream xmlInputStream = metadata.createInputStream();
254                    InputStreamReader isr = new InputStreamReader(xmlInputStream);
255                    BufferedReader reader = new BufferedReader(isr);
256                    String line;
257                    do {
258                        line = reader.readLine();
259                        if (line != null) {
260                            xmp += line + "\n";
261                        }
262                    } while (line != null);
263                    reader.close();
264                }
265            }
266            alreadyParsed = true;
267        } catch (IOException | BadSecurityHandlerException | CryptographyException e) {
268            throw new NuxeoException(e);
269        }
270    }
271
272    /**
273     * Return all and every parsed info in a String <code>HashMap</code>.
274     * <p>
275     * Possible values are:
276     * <ul>
277     * <li>File name</li>
278     * <li>File size</li>
279     * <li>PDF version</li>
280     * <li>Page count</li>
281     * <li>Page size</li>
282     * <li>Page width</li>
283     * <li>Page height</li>
284     * <li>Page layout</li>
285     * <li>Title</li>
286     * <li>Author</li>
287     * <li>Subject</li>
288     * <li>PDF producer</li>
289     * <li>Content creator</li>
290     * <li>Creation date</li>
291     */
292    public HashMap<String, String> toHashMap() {
293        // Parse if needed
294        run();
295        if (cachedMap == null) {
296            cachedMap = new LinkedHashMap<>();
297            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
298            cachedMap.put("File name", fileName);
299            cachedMap.put("File size", String.valueOf(fileSize));
300            cachedMap.put("PDF version", pdfVersion);
301            cachedMap.put("Page count", String.valueOf(numberOfPages));
302            cachedMap.put("Page size",
303                    String.format(Locale.ENGLISH, "%.1f x %.1f points", mediaBoxWidthInPoints, mediaBoxHeightInPoints));
304            cachedMap.put("Page width", String.valueOf(mediaBoxWidthInPoints));
305            cachedMap.put("Page height", String.valueOf(mediaBoxHeightInPoints));
306            cachedMap.put("Page layout", pageLayout);
307            cachedMap.put("Title", title);
308            cachedMap.put("Author", author);
309            cachedMap.put("Subject", subject);
310            cachedMap.put("PDF producer", producer);
311            cachedMap.put("Content creator", contentCreator);
312            if (creationDate != null) {
313                cachedMap.put("Creation date", dateFormat.format(creationDate.getTime()));
314            } else {
315                cachedMap.put("Creation date", "");
316            }
317            if (modificationDate != null) {
318                cachedMap.put("Modification date", dateFormat.format(modificationDate.getTime()));
319            } else {
320                cachedMap.put("Modification date", "");
321            }
322            // "Others"
323            cachedMap.put("Encrypted", String.valueOf(isEncrypted));
324            cachedMap.put("Keywords", keywords);
325            cachedMap.put("Media box width", String.valueOf(mediaBoxWidthInPoints));
326            cachedMap.put("Media box height", String.valueOf(mediaBoxHeightInPoints));
327            cachedMap.put("Crop box width", String.valueOf(cropBoxWidthInPoints));
328            cachedMap.put("Crop box height", String.valueOf(cropBoxHeightInPoints));
329            if(permissions != null) {
330                cachedMap.put("Can Print", String.valueOf(permissions.canPrint()));
331                cachedMap.put("Can Modify", String.valueOf(permissions.canModify()));
332                cachedMap.put("Can Extract", String.valueOf(permissions.canExtractContent()));
333                cachedMap.put("Can Modify Annotations", String.valueOf(permissions.canModifyAnnotations()));
334                cachedMap.put("Can Fill Forms", String.valueOf(permissions.canFillInForm()));
335                cachedMap.put("Can Extract for Accessibility", String.valueOf(
336                    permissions.canExtractForAccessibility()));
337                cachedMap.put("Can Assemble", String.valueOf(permissions.canAssembleDocument()));
338                cachedMap.put("Can Print Degraded", String.valueOf(permissions.canPrintDegraded()));
339            }
340        }
341        return cachedMap;
342    }
343
344    /**
345     * The <code>inMapping</code> map is an HashMap where the key is the xpath of the destination field, and the value
346     * is the exact label of a PDF info as returned by <code>toHashMap()</code>. For example:
347     * <p>
348     * <code><pre>
349     * pdfinfo:title=Title
350     * pdfinfo:producer=PDF Producer
351     * pdfinfo:mediabox_width=Media box width
352     * ...
353     * </pre></code>
354     * <p>
355     * If <code>inSave</code> is false, inSession can be null.
356     *
357     * @param inDoc Input DocumentModel.
358     * @param inMapping Input Mapping.
359     * @param inSave Whether should save.
360     * @param inSession If is saving, should do it in this particular session.
361     */
362    public DocumentModel toFields(DocumentModel inDoc, HashMap<String, String> inMapping, boolean inSave,
363                                  CoreSession inSession) {
364        // Parse if needed
365        run();
366        Map<String, String> values = toHashMap();
367        for (String inXPath : inMapping.keySet()) {
368            String value = values.get(inMapping.get(inXPath));
369            inDoc.setPropertyValue(inXPath, value);
370        }
371        if (inSave) {
372            inDoc = inSession.saveDocument(inDoc);
373        }
374        return inDoc;
375    }
376
377    /**
378     * Wrapper for <code>toHashMap().toString()</code>
379     */
380    @Override
381    public String toString() {
382        return toHashMap().toString();
383    }
384
385    public int getNumberOfPages() {
386        return numberOfPages;
387    }
388
389    public float getMediaBoxWidthInPoints() {
390        return mediaBoxWidthInPoints;
391    }
392
393    public float getMediaBoxHeightInPoints() {
394        return mediaBoxHeightInPoints;
395    }
396
397    public float getCropBoxWidthInPoints() {
398        return cropBoxWidthInPoints;
399    }
400
401    public float getCropBoxHeightInPoints() {
402        return cropBoxHeightInPoints;
403    }
404
405    public long getFileSize() {
406        return fileSize;
407    }
408
409    public boolean isEncrypted() {
410        return isEncrypted;
411    }
412
413    public String getAuthor() {
414        return author;
415    }
416
417    public String getContentCreator() {
418        return contentCreator;
419    }
420
421    public String getFileName() {
422        return fileName;
423    }
424
425    public String getKeywords() {
426        return keywords;
427    }
428
429    public String getPageLayout() {
430        return pageLayout;
431    }
432
433    public String getPdfVersion() {
434        return pdfVersion;
435    }
436
437    public String getProducer() {
438        return producer;
439    }
440
441    public String getSubject() {
442        return subject;
443    }
444
445    public String getTitle() {
446        return title;
447    }
448
449    public String getXmp() {
450        return xmp;
451    }
452
453    public Calendar getCreationDate() {
454        return creationDate;
455    }
456
457    public Calendar getModificationDate() {
458        return modificationDate;
459    }
460
461    public AccessPermission getPermissions() {
462        return permissions;
463    }
464
465}