/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Thibaud Arguillere
 *     Miguel Nixo
 */
package org.nuxeo.ecm.platform.pdf;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.NuxeoException;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;

/**
 * The class will parse the info embedded in a PDF, and return them either globally (<code>toHashMap()</code> or
 * <code>toString()</code>) or via individual getters.
 * <p>
 * The PDF is parsed only at first call to <code>run()</code>. Values are cached during first call. Individual getters
 * do not trigger parsing: before <code>run()</code> has been called they return the initial field values.
 * <p>
 * About page sizes, see <a href="http://www.prepressure.com/pdf/basics/page-boxes">PDF page boxes</a> for details.
 * Pages are scanned in order and the scan stops as soon as both a media box and a crop box have been found. The
 * dimensions are in points. Divide by 72 to get it in inches.
 *
 * @since 8.10
 */
public class PDFInfo {

    private Blob pdfBlob;

    private int numberOfPages = -1;

    private float mediaBoxWidthInPoints = 0.0f;

    private float mediaBoxHeightInPoints = 0.0f;

    private float cropBoxWidthInPoints = 0.0f;

    private float cropBoxHeightInPoints = 0.0f;

    private long fileSize = -1;

    private boolean isEncrypted;

    private boolean doXMP = false;

    private boolean alreadyParsed = false;

    private String password;

    private String author = "";

    private String contentCreator = "";

    private String fileName = "";

    private String keywords = "";

    private String pageLayout = "";

    private String pdfVersion = "";

    private String producer = "";

    private String subject = "";

    private String title;

    private String xmp;

    private Calendar creationDate;

    private Calendar modificationDate;

    private AccessPermission permissions;

    private LinkedHashMap<String, String> cachedMap;

    /**
     * Constructor with a Blob.
     *
     * @param inBlob Input blob.
     */
    public PDFInfo(Blob inBlob) {
        this(inBlob, null);
    }

    /**
     * Constructor for Blob + encrypted PDF.
     *
     * @param inBlob Input blob.
     * @param inPassword If the PDF is encrypted.
     */
    public PDFInfo(Blob inBlob, String inPassword) {
        pdfBlob = inBlob;
        password = inPassword;
        title = "";
    }

    /**
     * Constructor with a DocumentModel. Uses the default <code>file:content</code> xpath to get the blob from the
     * document.
     *
     * @param inDoc Input DocumentModel.
     */
    public PDFInfo(DocumentModel inDoc) {
        this(inDoc, null, null);
    }

    /**
     * Constructor for DocumentModel + encrypted PDF
     * <p>
     * If <code>inXPath</code> is <code>null</code> or "", it is set to the default <code>file:content</code> value.
     *
     * @param inDoc Input DocumentModel.
     * @param inXPath Input XPath.
     * @param inPassword If the PDF is encrypted.
     */
    public PDFInfo(DocumentModel inDoc, String inXPath, String inPassword) {
        if (inXPath == null || inXPath.isEmpty()) {
            inXPath = "file:content";
        }
        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
        password = inPassword;
        title = "";
    }

    /**
     * If set to true, parsing will also extract the XMP metadata of the PDF.
     * <p>
     * The value cannot be modified if <code>run()</code> already has been called.
     *
     * @param inValue true to extract XMP.
     * @throws NuxeoException if the blob has already been parsed and <code>inValue</code> differs from the current
     *             setting.
     */
    public void setParseWithXMP(boolean inValue) {
        if (alreadyParsed && doXMP != inValue) {
            throw new NuxeoException("Value of 'doXMP' cannot be modified after the blob has been already parsed.");
        }
        doXMP = inValue;
    }

    /**
     * Replaces a <code>null</code> string with the empty string.
     */
    private String checkNotNull(String inValue) {
        return inValue == null ? "" : inValue;
    }

    /**
     * After building the object with the correct constructor, and after possibly having set some parsing property
     * (<code>setParseWithXMP()</code>, for example), this method will extract the information from the PDF.
     * <p>
     * After extraction, the info is available through getters: Either all of them (<code>toHashMap()</code> or
     * <code>toString()</code>) or individual info (see all getters).
     * <p>
     * Calling this method again after a successful parsing does nothing.
     *
     * @throws NuxeoException if the PDF cannot be loaded, decrypted or read (wraps the underlying
     *             <code>IOException</code>, <code>BadSecurityHandlerException</code> or
     *             <code>CryptographyException</code>).
     */
    public void run() throws NuxeoException {
        // In case the caller calls several time the run() method
        if (alreadyParsed) {
            return;
        }
        fileName = pdfBlob.getFilename();
        File pdfFile = pdfBlob.getFile();
        fileSize = (pdfFile == null) ? -1 : pdfFile.length();
        // Both the blob stream and the document are managed resources, so the stream is released even when
        // loading or parsing fails.
        try (InputStream pdfStream = pdfBlob.getStream(); PDDocument pdfDoc = PDDocument.load(pdfStream)) {
            isEncrypted = pdfDoc.isEncrypted();
            if (isEncrypted) {
                pdfDoc.openProtection(new StandardDecryptionMaterial(password));
            }
            numberOfPages = pdfDoc.getNumberOfPages();
            PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog();
            pageLayout = checkNotNull(docCatalog.getPageLayout());
            pdfVersion = String.valueOf(pdfDoc.getDocument().getVersion());
            readDocumentInformation(pdfDoc.getDocumentInformation());
            permissions = pdfDoc.getCurrentAccessPermission();
            readPageBoxes(docCatalog.getAllPages());
            if (doXMP) {
                PDMetadata metadata = docCatalog.getMetadata();
                xmp = (metadata == null) ? null : readXmp(metadata);
            }
            alreadyParsed = true;
        } catch (IOException | BadSecurityHandlerException | CryptographyException e) {
            throw new NuxeoException(e);
        }
    }

    /**
     * Copies the entries of the document information dictionary into the fields, replacing missing strings with "".
     */
    private void readDocumentInformation(PDDocumentInformation docInfo) {
        author = checkNotNull(docInfo.getAuthor());
        contentCreator = checkNotNull(docInfo.getCreator());
        keywords = checkNotNull(docInfo.getKeywords());
        // A date that cannot be read is deliberately reported as "no date" instead of failing the whole parsing.
        try {
            creationDate = docInfo.getCreationDate();
        } catch (IOException ignored) {
            creationDate = null;
        }
        try {
            modificationDate = docInfo.getModificationDate();
        } catch (IOException ignored) {
            modificationDate = null;
        }
        producer = checkNotNull(docInfo.getProducer());
        subject = checkNotNull(docInfo.getSubject());
        title = checkNotNull(docInfo.getTitle());
    }

    /**
     * Reads media box and crop box dimensions. All four dimensions are first reset to -1, then pages are scanned in
     * order until both a media box and a crop box have been found.
     */
    private void readPageBoxes(List<?> allPages) {
        mediaBoxWidthInPoints = mediaBoxHeightInPoints = cropBoxWidthInPoints = cropBoxHeightInPoints = -1;
        boolean gotMediaBox = false;
        boolean gotCropBox = false;
        for (Object pageObject : allPages) {
            PDPage page = (PDPage) pageObject;
            if (page != null) {
                PDRectangle mediaBox = page.findMediaBox();
                if (mediaBox != null) {
                    mediaBoxWidthInPoints = mediaBox.getWidth();
                    mediaBoxHeightInPoints = mediaBox.getHeight();
                    gotMediaBox = true;
                }
                PDRectangle cropBox = page.findCropBox();
                if (cropBox != null) {
                    cropBoxWidthInPoints = cropBox.getWidth();
                    cropBoxHeightInPoints = cropBox.getHeight();
                    gotCropBox = true;
                }
            }
            if (gotMediaBox && gotCropBox) {
                break;
            }
        }
    }

    /**
     * Reads the whole XMP metadata stream as text, one "\n" appended after every line.
     * <p>
     * The stream is decoded as UTF-8 rather than with the platform default charset, so the result does not depend on
     * the JVM configuration. NOTE(review): this assumes the XMP packet is UTF-8 encoded - confirm if UTF-16 packets
     * must be supported.
     */
    private static String readXmp(PDMetadata metadata) throws IOException {
        StringBuilder content = new StringBuilder();
        try (InputStream xmlInputStream = metadata.createInputStream();
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(xmlInputStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        }
        return content.toString();
    }

    /**
     * Formats a date for the info map, using "" when there is no date.
     */
    private static String formatDate(SimpleDateFormat inFormat, Calendar inDate) {
        return inDate == null ? "" : inFormat.format(inDate.getTime());
    }

    /**
     * Return all and every parsed info in a String <code>HashMap</code>. The PDF is parsed first if needed.
     * <p>
     * The map is built once and the same instance is returned by every call. Entries are, in this order:
     * <ul>
     * <li>File name</li>
     * <li>File size</li>
     * <li>PDF version</li>
     * <li>Page count</li>
     * <li>Page size</li>
     * <li>Page width</li>
     * <li>Page height</li>
     * <li>Page layout</li>
     * <li>Title</li>
     * <li>Author</li>
     * <li>Subject</li>
     * <li>PDF producer</li>
     * <li>Content creator</li>
     * <li>Creation date</li>
     * <li>Modification date</li>
     * <li>Encrypted</li>
     * <li>Keywords</li>
     * <li>Media box width</li>
     * <li>Media box height</li>
     * <li>Crop box width</li>
     * <li>Crop box height</li>
     * <li>Can Print, Can Modify, Can Extract, Can Modify Annotations, Can Fill Forms, Can Extract for Accessibility,
     * Can Assemble, Can Print Degraded (only when access permissions are available)</li>
     * </ul>
     * Dates are formatted as <code>yyyy-MM-dd HH:mm:ss</code>, or "" when absent.
     *
     * @return the cached map of labels to values.
     * @throws NuxeoException if parsing the PDF fails.
     */
    public HashMap<String, String> toHashMap() {
        // Parse if needed
        run();
        if (cachedMap == null) {
            LinkedHashMap<String, String> map = new LinkedHashMap<>();
            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            map.put("File name", fileName);
            map.put("File size", String.valueOf(fileSize));
            map.put("PDF version", pdfVersion);
            map.put("Page count", String.valueOf(numberOfPages));
            map.put("Page size", String.format("%.1f x %.1f points", mediaBoxWidthInPoints, mediaBoxHeightInPoints));
            map.put("Page width", String.valueOf(mediaBoxWidthInPoints));
            map.put("Page height", String.valueOf(mediaBoxHeightInPoints));
            map.put("Page layout", pageLayout);
            map.put("Title", title);
            map.put("Author", author);
            map.put("Subject", subject);
            map.put("PDF producer", producer);
            map.put("Content creator", contentCreator);
            map.put("Creation date", formatDate(dateFormat, creationDate));
            map.put("Modification date", formatDate(dateFormat, modificationDate));
            // "Others"
            map.put("Encrypted", String.valueOf(isEncrypted));
            map.put("Keywords", keywords);
            map.put("Media box width", String.valueOf(mediaBoxWidthInPoints));
            map.put("Media box height", String.valueOf(mediaBoxHeightInPoints));
            map.put("Crop box width", String.valueOf(cropBoxWidthInPoints));
            map.put("Crop box height", String.valueOf(cropBoxHeightInPoints));
            if (permissions != null) {
                map.put("Can Print", String.valueOf(permissions.canPrint()));
                map.put("Can Modify", String.valueOf(permissions.canModify()));
                map.put("Can Extract", String.valueOf(permissions.canExtractContent()));
                map.put("Can Modify Annotations", String.valueOf(permissions.canModifyAnnotations()));
                map.put("Can Fill Forms", String.valueOf(permissions.canFillInForm()));
                map.put("Can Extract for Accessibility", String.valueOf(permissions.canExtractForAccessibility()));
                map.put("Can Assemble", String.valueOf(permissions.canAssembleDocument()));
                map.put("Can Print Degraded", String.valueOf(permissions.canPrintDegraded()));
            }
            cachedMap = map;
        }
        return cachedMap;
    }

    /**
     * The <code>inMapping</code> map is an HashMap where the key is the xpath of the destination field, and the value
     * is the exact label of a PDF info as returned by <code>toHashMap()</code>. For example:
     * <p>
     * <code><pre>
     * pdfinfo:title=Title
     * pdfinfo:producer=PDF producer
     * pdfinfo:mediabox_width=Media box width
     * ...
     * </pre></code>
     * <p>
     * A label that is not a key of <code>toHashMap()</code> sets the destination field to <code>null</code>.
     * <p>
     * If <code>inSave</code> is false, inSession can be null.
     *
     * @param inDoc Input DocumentModel.
     * @param inMapping Input Mapping.
     * @param inSave Whether should save.
     * @param inSession If is saving, should do it in this particular session.
     * @return <code>inDoc</code>, or the document returned by <code>inSession.saveDocument()</code> when
     *         <code>inSave</code> is true.
     * @throws NuxeoException if parsing the PDF fails.
     */
    public DocumentModel toFields(DocumentModel inDoc, HashMap<String, String> inMapping, boolean inSave,
            CoreSession inSession) {
        // toHashMap() parses the PDF if needed
        Map<String, String> values = toHashMap();
        for (Map.Entry<String, String> mapping : inMapping.entrySet()) {
            inDoc.setPropertyValue(mapping.getKey(), values.get(mapping.getValue()));
        }
        if (inSave) {
            inDoc = inSession.saveDocument(inDoc);
        }
        return inDoc;
    }

    /**
     * Wrapper for <code>toHashMap().toString()</code>
     */
    @Override
    public String toString() {
        return toHashMap().toString();
    }

    /** @return the page count, or -1 if the PDF has not been parsed. */
    public int getNumberOfPages() {
        return numberOfPages;
    }

    /** @return the media box width in points, or -1 if parsing found no media box. */
    public float getMediaBoxWidthInPoints() {
        return mediaBoxWidthInPoints;
    }

    /** @return the media box height in points, or -1 if parsing found no media box. */
    public float getMediaBoxHeightInPoints() {
        return mediaBoxHeightInPoints;
    }

    /** @return the crop box width in points, or -1 if parsing found no crop box. */
    public float getCropBoxWidthInPoints() {
        return cropBoxWidthInPoints;
    }

    /** @return the crop box height in points, or -1 if parsing found no crop box. */
    public float getCropBoxHeightInPoints() {
        return cropBoxHeightInPoints;
    }

    /** @return the length of the file backing the blob, or -1 if the blob has no file or was not parsed. */
    public long getFileSize() {
        return fileSize;
    }

    /** @return whether the parsed PDF was reported as encrypted. */
    public boolean isEncrypted() {
        return isEncrypted;
    }

    public String getAuthor() {
        return author;
    }

    public String getContentCreator() {
        return contentCreator;
    }

    /** @return the file name of the blob, as read during parsing. */
    public String getFileName() {
        return fileName;
    }

    public String getKeywords() {
        return keywords;
    }

    public String getPageLayout() {
        return pageLayout;
    }

    public String getPdfVersion() {
        return pdfVersion;
    }

    public String getProducer() {
        return producer;
    }

    public String getSubject() {
        return subject;
    }

    public String getTitle() {
        return title;
    }

    /**
     * @return the XMP metadata as text, or <code>null</code> if XMP extraction was not requested with
     *         <code>setParseWithXMP(true)</code>, the PDF has no metadata stream, or the PDF was not parsed.
     */
    public String getXmp() {
        return xmp;
    }

    /** @return the creation date, or <code>null</code> if absent or unreadable. */
    public Calendar getCreationDate() {
        return creationDate;
    }

    /** @return the modification date, or <code>null</code> if absent or unreadable. */
    public Calendar getModificationDate() {
        return modificationDate;
    }

    /** @return the access permissions read during parsing, or <code>null</code> if not parsed. */
    public AccessPermission getPermissions() {
        return permissions;
    }

}