/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Thibaud Arguillere
 *     Miguel Nixo
 */
package org.nuxeo.ecm.platform.pdf;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.NuxeoException;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;

/**
 * The class will parse the info embedded in a PDF, and return them either globally (<code>toHashMap()</code> or
 * <code>toString()</code>) or via individual getters.
 * <p>
 * The PDF is parsed only at first call to <code>run()</code>. Values are cached during first call. The individual
 * getters do not trigger the parsing: until <code>run()</code> (or <code>toHashMap()</code>, <code>toString()</code>,
 * <code>toFields()</code>, which call it) has been invoked they return the initial field values.
 * <p>
 * About page sizes, see <a href="http://www.prepressure.com/pdf/basics/page-boxes">PDF page boxes</a> for details.
 * Here, we get the info from the first page only. The dimensions are in points. Divide by 72 to get it in inches.
 *
 * @since 8.10
 */
public class PDFInfo {

    /** XPath used to fetch the blob when the caller does not provide one. */
    private static final String DEFAULT_BLOB_XPATH = "file:content";

    /** Pattern used to render the creation/modification dates in {@link #toHashMap()}. */
    private static final String DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    private Blob pdfBlob;

    private int numberOfPages = -1;

    private float mediaBoxWidthInPoints = 0.0f;

    private float mediaBoxHeightInPoints = 0.0f;

    private float cropBoxWidthInPoints = 0.0f;

    private float cropBoxHeightInPoints = 0.0f;

    private long fileSize = -1;

    private boolean isEncrypted;

    private boolean doXMP = false;

    private boolean alreadyParsed = false;

    private String password;

    private String author = "";

    private String contentCreator = "";

    private String fileName = "";

    private String keywords = "";

    private String pageLayout = "";

    private String pdfVersion = "";

    private String producer = "";

    private String subject = "";

    private String title = "";

    private String xmp;

    private Calendar creationDate;

    private Calendar modificationDate;

    private AccessPermission permissions;

    private LinkedHashMap<String, String> cachedMap;

    /**
     * Constructor with a Blob.
     *
     * @param inBlob Input blob.
     */
    public PDFInfo(Blob inBlob) {
        this(inBlob, null);
    }

    /**
     * Constructor for Blob + encrypted PDF.
     *
     * @param inBlob Input blob.
     * @param inPassword If the PDF is encrypted.
     */
    public PDFInfo(Blob inBlob, String inPassword) {
        pdfBlob = inBlob;
        password = inPassword;
    }

    /**
     * Constructor with a DocumentModel. Uses the default <code>file:content</code> xpath to get the blob from the
     * document.
     *
     * @param inDoc Input DocumentModel.
     */
    public PDFInfo(DocumentModel inDoc) {
        this(inDoc, null, null);
    }

    /**
     * Constructor for DocumentModel + encrypted PDF
     * <p>
     * If <code>inXPath</code> is <code>null</code> or "", it is set to the default
     * <code>file:content</code> value.
     *
     * @param inDoc Input DocumentModel.
     * @param inXPath Input XPath.
     * @param inPassword If the PDF is encrypted.
     */
    public PDFInfo(DocumentModel inDoc, String inXPath, String inPassword) {
        this(blobOf(inDoc, inXPath), inPassword);
    }

    /**
     * Reads the blob stored at <code>inXPath</code> in <code>inDoc</code>, falling back to the default
     * <code>file:content</code> xpath when <code>inXPath</code> is <code>null</code> or empty.
     */
    private static Blob blobOf(DocumentModel inDoc, String inXPath) {
        String xpath = (inXPath == null || inXPath.isEmpty()) ? DEFAULT_BLOB_XPATH : inXPath;
        return (Blob) inDoc.getPropertyValue(xpath);
    }

    /**
     * If set to true, parsing will extract the XMP metadata of the PDF.
     * <p>
     * The value cannot be modified if <code>run()</code> already has been called.
     *
     * @param inValue true to extract XMP.
     * @throws NuxeoException if the blob has already been parsed and <code>inValue</code> differs from the value
     *             used for that parsing.
     */
    public void setParseWithXMP(boolean inValue) {
        if (alreadyParsed && doXMP != inValue) {
            throw new NuxeoException("Value of 'doXMP' cannot be modified after the blob has been already parsed.");
        }
        doXMP = inValue;
    }

    /** Maps <code>null</code> to the empty string so that String getters never return <code>null</code>. */
    private String checkNotNull(String inValue) {
        return inValue == null ? "" : inValue;
    }

    /**
     * After building the object with the correct constructor, and after possibly having set some parsing property
     * (<code>setParseWithXMP()</code>, for example), this method will extract the information from the PDF.
     * <p>
     * After extraction, the info is available through getters: Either all of them (<code>toHashMap()</code> or
     * <code>toString()</code>) or individual info (see all getters).
     * <p>
     * Calling this method again after a successful parsing is a no-op.
     *
     * @throws NuxeoException if there is no blob to parse, or if loading, decrypting or reading the PDF fails.
     */
    public void run() throws NuxeoException {
        // In case the caller calls several time the run() method
        if (alreadyParsed) {
            return;
        }
        if (pdfBlob == null) {
            throw new NuxeoException("Cannot extract PDF info: the input blob is null.");
        }
        fileName = checkNotNull(pdfBlob.getFilename());
        File pdfFile = pdfBlob.getFile();
        fileSize = (pdfFile == null) ? -1 : pdfFile.length();
        // Both the blob stream and the document are closed, whatever happens during the parsing.
        try (InputStream pdfStream = pdfBlob.getStream(); PDDocument pdfDoc = PDDocument.load(pdfStream)) {
            isEncrypted = pdfDoc.isEncrypted();
            if (isEncrypted) {
                pdfDoc.openProtection(new StandardDecryptionMaterial(password));
            }
            numberOfPages = pdfDoc.getNumberOfPages();
            PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog();
            pageLayout = checkNotNull(docCatalog.getPageLayout());
            pdfVersion = String.valueOf(pdfDoc.getDocument().getVersion());
            PDDocumentInformation docInfo = pdfDoc.getDocumentInformation();
            author = checkNotNull(docInfo.getAuthor());
            contentCreator = checkNotNull(docInfo.getCreator());
            keywords = checkNotNull(docInfo.getKeywords());
            try {
                creationDate = docInfo.getCreationDate();
            } catch (IOException ignored) {
                // Best effort: a date that cannot be read is reported as "no date"
                creationDate = null;
            }
            try {
                modificationDate = docInfo.getModificationDate();
            } catch (IOException ignored) {
                // Best effort: a date that cannot be read is reported as "no date"
                modificationDate = null;
            }
            producer = checkNotNull(docInfo.getProducer());
            subject = checkNotNull(docInfo.getSubject());
            title = checkNotNull(docInfo.getTitle());
            permissions = pdfDoc.getCurrentAccessPermission();
            readPageBoxes(docCatalog);
            if (doXMP) {
                xmp = readXmp(docCatalog);
            }
            alreadyParsed = true;
        } catch (IOException | BadSecurityHandlerException | CryptographyException e) {
            throw new NuxeoException(e);
        }
    }

    /**
     * Fills the media box and crop box dimensions. Pages are walked in order and the walk stops as soon as both a
     * media box and a crop box have been found. Dimensions are left to -1 when no box is found.
     */
    private void readPageBoxes(PDDocumentCatalog docCatalog) {
        mediaBoxWidthInPoints = -1;
        mediaBoxHeightInPoints = -1;
        cropBoxWidthInPoints = -1;
        cropBoxHeightInPoints = -1;
        List<?> allPages = docCatalog.getAllPages();
        boolean gotMediaBox = false;
        boolean gotCropBox = false;
        for (Object pageObject : allPages) {
            PDPage page = (PDPage) pageObject;
            if (page != null) {
                PDRectangle box = page.findMediaBox();
                if (box != null) {
                    mediaBoxWidthInPoints = box.getWidth();
                    mediaBoxHeightInPoints = box.getHeight();
                    gotMediaBox = true;
                }
                box = page.findCropBox();
                if (box != null) {
                    cropBoxWidthInPoints = box.getWidth();
                    cropBoxHeightInPoints = box.getHeight();
                    gotCropBox = true;
                }
            }
            if (gotMediaBox && gotCropBox) {
                break;
            }
        }
    }

    /**
     * Reads the XMP metadata stream of the catalog as text, each line being terminated by "\n".
     *
     * @return the XMP as a String, or <code>null</code> if the catalog has no metadata.
     */
    private String readXmp(PDDocumentCatalog docCatalog) throws IOException {
        PDMetadata metadata = docCatalog.getMetadata();
        if (metadata == null) {
            return null;
        }
        StringBuilder content = new StringBuilder();
        // XMP embedded in a PDF is UTF-8: do not depend on the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(metadata.createInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        }
        return content.toString();
    }

    /**
     * Return all and every parsed info in a String <code>HashMap</code>. The PDF is parsed if needed.
     * <p>
     * The returned map is a copy, in insertion order: modifying it has no effect on this object.
     * <p>
     * Possible values are:
     * <ul>
     * <li>File name</li>
     * <li>File size</li>
     * <li>PDF version</li>
     * <li>Page count</li>
     * <li>Page size</li>
     * <li>Page width</li>
     * <li>Page height</li>
     * <li>Page layout</li>
     * <li>Title</li>
     * <li>Author</li>
     * <li>Subject</li>
     * <li>PDF producer</li>
     * <li>Content creator</li>
     * <li>Creation date</li>
     * <li>Modification date</li>
     * <li>Encrypted</li>
     * <li>Keywords</li>
     * <li>Media box width</li>
     * <li>Media box height</li>
     * <li>Crop box width</li>
     * <li>Crop box height</li>
     * <li>Can Print, Can Modify, Can Extract, Can Modify Annotations, Can Fill Forms, Can Extract for
     * Accessibility, Can Assemble, Can Print Degraded (only when the permissions are available)</li>
     * </ul>
     *
     * @return the label/value map of the parsed info.
     * @throws NuxeoException if the PDF cannot be parsed.
     */
    public HashMap<String, String> toHashMap() {
        // Parse if needed
        run();
        if (cachedMap == null) {
            cachedMap = buildInfoMap();
        }
        // Defensive copy: callers must not be able to alter the cached values
        return new LinkedHashMap<>(cachedMap);
    }

    /** Builds the label/value map from the parsed fields. */
    private LinkedHashMap<String, String> buildInfoMap() {
        LinkedHashMap<String, String> info = new LinkedHashMap<>();
        SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_PATTERN);
        info.put("File name", fileName);
        info.put("File size", String.valueOf(fileSize));
        info.put("PDF version", pdfVersion);
        info.put("Page count", String.valueOf(numberOfPages));
        info.put("Page size",
                String.format(Locale.ENGLISH, "%.1f x %.1f points", mediaBoxWidthInPoints, mediaBoxHeightInPoints));
        info.put("Page width", String.valueOf(mediaBoxWidthInPoints));
        info.put("Page height", String.valueOf(mediaBoxHeightInPoints));
        info.put("Page layout", pageLayout);
        info.put("Title", title);
        info.put("Author", author);
        info.put("Subject", subject);
        info.put("PDF producer", producer);
        info.put("Content creator", contentCreator);
        info.put("Creation date", formatDate(dateFormat, creationDate));
        info.put("Modification date", formatDate(dateFormat, modificationDate));
        // "Others"
        info.put("Encrypted", String.valueOf(isEncrypted));
        info.put("Keywords", keywords);
        info.put("Media box width", String.valueOf(mediaBoxWidthInPoints));
        info.put("Media box height", String.valueOf(mediaBoxHeightInPoints));
        info.put("Crop box width", String.valueOf(cropBoxWidthInPoints));
        info.put("Crop box height", String.valueOf(cropBoxHeightInPoints));
        if (permissions != null) {
            info.put("Can Print", String.valueOf(permissions.canPrint()));
            info.put("Can Modify", String.valueOf(permissions.canModify()));
            info.put("Can Extract", String.valueOf(permissions.canExtractContent()));
            info.put("Can Modify Annotations", String.valueOf(permissions.canModifyAnnotations()));
            info.put("Can Fill Forms", String.valueOf(permissions.canFillInForm()));
            info.put("Can Extract for Accessibility", String.valueOf(permissions.canExtractForAccessibility()));
            info.put("Can Assemble", String.valueOf(permissions.canAssembleDocument()));
            info.put("Can Print Degraded", String.valueOf(permissions.canPrintDegraded()));
        }
        return info;
    }

    /** Formats a date for the info map; a missing date is rendered as the empty string. */
    private static String formatDate(SimpleDateFormat dateFormat, Calendar date) {
        return date == null ? "" : dateFormat.format(date.getTime());
    }

    /**
     * The <code>inMapping</code> map is an HashMap where the key is the xpath of the destination field, and the value
     * is the exact label (case sensitive) of a PDF info as returned by <code>toHashMap()</code>. For example:
     * <p>
     * <code><pre>
     * pdfinfo:title=Title
     * pdfinfo:producer=PDF producer
     * pdfinfo:mediabox_width=Media box width
     * ...
     * </pre></code>
     * <p>
     * A label which is not a key of <code>toHashMap()</code> sets the destination field to <code>null</code>.
     * <p>
     * If <code>inSave</code> is false, inSession can be null.
     *
     * @param inDoc Input DocumentModel.
     * @param inMapping Input Mapping.
     * @param inSave Whether should save.
     * @param inSession If is saving, should do it in this particular session.
     * @return the updated document (the saved document when <code>inSave</code> is true).
     * @throws NuxeoException if the PDF cannot be parsed, or if <code>inSave</code> is true and
     *             <code>inSession</code> is <code>null</code>.
     */
    public DocumentModel toFields(DocumentModel inDoc, HashMap<String, String> inMapping, boolean inSave,
            CoreSession inSession) {
        // Fail before touching the document rather than after having modified it
        if (inSave && inSession == null) {
            throw new NuxeoException("A CoreSession is required to save the document.");
        }
        // Parses if needed
        Map<String, String> values = toHashMap();
        for (Map.Entry<String, String> mapping : inMapping.entrySet()) {
            inDoc.setPropertyValue(mapping.getKey(), values.get(mapping.getValue()));
        }
        if (inSave) {
            inDoc = inSession.saveDocument(inDoc);
        }
        return inDoc;
    }

    /**
     * Wrapper for <code>toHashMap().toString()</code>
     */
    @Override
    public String toString() {
        return toHashMap().toString();
    }

    public int getNumberOfPages() {
        return numberOfPages;
    }

    public float getMediaBoxWidthInPoints() {
        return mediaBoxWidthInPoints;
    }

    public float getMediaBoxHeightInPoints() {
        return mediaBoxHeightInPoints;
    }

    public float getCropBoxWidthInPoints() {
        return cropBoxWidthInPoints;
    }

    public float getCropBoxHeightInPoints() {
        return cropBoxHeightInPoints;
    }

    public long getFileSize() {
        return fileSize;
    }

    public boolean isEncrypted() {
        return isEncrypted;
    }

    public String getAuthor() {
        return author;
    }

    public String getContentCreator() {
        return contentCreator;
    }

    public String getFileName() {
        return fileName;
    }

    public String getKeywords() {
        return keywords;
    }

    public String getPageLayout() {
        return pageLayout;
    }

    public String getPdfVersion() {
        return pdfVersion;
    }

    public String getProducer() {
        return producer;
    }

    public String getSubject() {
        return subject;
    }

    public String getTitle() {
        return title;
    }

    /**
     * @return the XMP metadata, or <code>null</code> if it was not extracted (see <code>setParseWithXMP()</code>) or
     *         if the PDF has none.
     */
    public String getXmp() {
        return xmp;
    }

    public Calendar getCreationDate() {
        return creationDate;
    }

    public Calendar getModificationDate() {
        return modificationDate;
    }

    public AccessPermission getPermissions() {
        return permissions;
    }

}