001/* 002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Thibaud Arguillere 018 * Miguel Nixo 019 */ 020package org.nuxeo.ecm.platform.pdf; 021 022import java.io.BufferedReader; 023import java.io.File; 024import java.io.IOException; 025import java.io.InputStream; 026import java.io.InputStreamReader; 027import java.text.SimpleDateFormat; 028import java.util.Calendar; 029import java.util.HashMap; 030import java.util.LinkedHashMap; 031import java.util.List; 032import java.util.Locale; 033import java.util.Map; 034import org.apache.pdfbox.exceptions.CryptographyException; 035import org.apache.pdfbox.pdmodel.PDDocument; 036import org.apache.pdfbox.pdmodel.PDDocumentCatalog; 037import org.apache.pdfbox.pdmodel.PDDocumentInformation; 038import org.apache.pdfbox.pdmodel.PDPage; 039import org.apache.pdfbox.pdmodel.common.PDMetadata; 040import org.apache.pdfbox.pdmodel.common.PDRectangle; 041import org.apache.pdfbox.pdmodel.encryption.AccessPermission; 042import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException; 043import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; 044import org.nuxeo.ecm.core.api.Blob; 045import org.nuxeo.ecm.core.api.NuxeoException; 046import org.nuxeo.ecm.core.api.CoreSession; 047import org.nuxeo.ecm.core.api.DocumentModel; 048 049/** 050 * The class will parse the info embedded in a PDF, and return them either globally (<code>toHashMap()</code> or 051 * <code>toString()</code>) or via individual getters. 052 * <p> 053 * The PDF is parsed only at first call to <code>run()</code>. Values are cached during first call. 054 * <p> 055 * About page sizes, see <a href="http://www.prepressure.com/pdf/basics/page-boxes">PDF page boxes</a> for details. 056 * Here, we get the info from the first page only. The dimensions are in points. Divide by 72 to get it in inches. 057 * 058 * @since 8.10 059 */ 060public class PDFInfo { 061 062 private Blob pdfBlob; 063 064 private int numberOfPages = -1; 065 066 private float mediaBoxWidthInPoints = 0.0f; 067 068 private float mediaBoxHeightInPoints = 0.0f; 069 070 private float cropBoxWidthInPoints = 0.0f; 071 072 private float cropBoxHeightInPoints = 0.0f; 073 074 private long fileSize = -1; 075 076 private boolean isEncrypted; 077 078 private boolean doXMP = false; 079 080 private boolean alreadyParsed = false; 081 082 private String password; 083 084 private String author = ""; 085 086 private String contentCreator = ""; 087 088 private String fileName = ""; 089 090 private String keywords = ""; 091 092 private String pageLayout = ""; 093 094 private String pdfVersion = ""; 095 096 private String producer = ""; 097 098 private String subject = ""; 099 100 private String title; 101 102 private String xmp; 103 104 private Calendar creationDate; 105 106 private Calendar modificationDate; 107 108 private AccessPermission permissions; 109 110 private LinkedHashMap<String, String> cachedMap; 111 112 /** 113 * Constructor with a Blob. 114 * 115 * @param inBlob Input blob. 116 */ 117 public PDFInfo(Blob inBlob) { 118 this(inBlob, null); 119 } 120 121 /** 122 * Constructor for Blob + encrypted PDF. 123 * 124 * @param inBlob Input blob. 125 * @param inPassword If the PDF is encrypted. 126 */ 127 public PDFInfo(Blob inBlob, String inPassword) { 128 pdfBlob = inBlob; 129 password = inPassword; 130 title = ""; 131 } 132 133 /** 134 * Constructor with a DocumentModel. Uses the default <code>file:content</code> xpath to get the blob from the 135 * document. 136 * 137 * @param inDoc Input DocumentModel. 138 */ 139 public PDFInfo(DocumentModel inDoc) { 140 this(inDoc, null, null); 141 } 142 143 /** 144 * Constructor for DocumentModel + encrypted PDF 145 * <p> 146 * If <inXPath</code> is <code>null</code> or "", it is set to the default 147 * <code>file:content</code> value. 148 * 149 * @param inDoc Input DocumentModel. 150 * @param inXPath Input XPath. 151 * @param inPassword If the PDF is encrypted. 152 */ 153 public PDFInfo(DocumentModel inDoc, String inXPath, String inPassword) { 154 if (inXPath == null || inXPath.isEmpty()) { 155 inXPath = "file:content"; 156 } 157 pdfBlob = (Blob) inDoc.getPropertyValue(inXPath); 158 password = inPassword; 159 title = ""; 160 } 161 162 /** 163 * If set to true, parsing will extract PDF. 164 * <p> 165 * The value cannot be modified if <code>run()</code> already has been called. 166 * 167 * @param inValue true to extract XMP. 168 */ 169 public void setParseWithXMP(boolean inValue) { 170 if (alreadyParsed && doXMP != inValue) { 171 throw new NuxeoException("Value of 'doXML' cannot be modified after the blob has been already parsed."); 172 } 173 doXMP = inValue; 174 } 175 176 private String checkNotNull(String inValue) { 177 return inValue == null ? "" : inValue; 178 } 179 180 /** 181 * After building the object with the correct constructor, and after possibly having set some parsing property 182 * (<code>setParseWithXMP()</code>, for example), this method will extract the information from the PDF. 183 * <p> 184 * After extraction, the info is available through getters: Either all of them (<code>toHashMap()</code> or 185 * <code>toString()</code>) or individual info (see all getters). 186 * 187 * @throws NuxeoException 188 */ 189 public void run() throws NuxeoException { 190 // In case the caller calls several time the run() method 191 if (alreadyParsed) { 192 return; 193 } 194 fileName = pdfBlob.getFilename(); 195 File pdfFile = pdfBlob.getFile(); 196 fileSize = (pdfFile == null) ? -1 : pdfFile.length(); 197 try (PDDocument pdfDoc = PDDocument.load(pdfBlob.getStream())) { 198 isEncrypted = pdfDoc.isEncrypted(); 199 if (isEncrypted) { 200 pdfDoc.openProtection(new StandardDecryptionMaterial(password)); 201 } 202 numberOfPages = pdfDoc.getNumberOfPages(); 203 PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog(); 204 pageLayout = checkNotNull(docCatalog.getPageLayout()); 205 pdfVersion = String.valueOf(pdfDoc.getDocument().getVersion()); 206 PDDocumentInformation docInfo = pdfDoc.getDocumentInformation(); 207 author = checkNotNull(docInfo.getAuthor()); 208 contentCreator = checkNotNull(docInfo.getCreator()); 209 keywords = checkNotNull(docInfo.getKeywords()); 210 try { 211 creationDate = docInfo.getCreationDate(); 212 } catch (IOException e) { 213 creationDate = null; 214 } 215 try { 216 modificationDate = docInfo.getModificationDate(); 217 } catch (IOException e) { 218 modificationDate = null; 219 } 220 producer = checkNotNull(docInfo.getProducer()); 221 subject = checkNotNull(docInfo.getSubject()); 222 title = checkNotNull(docInfo.getTitle()); 223 permissions = pdfDoc.getCurrentAccessPermission(); 224 // Getting dimension is a bit tricky 225 mediaBoxWidthInPoints = mediaBoxHeightInPoints = cropBoxWidthInPoints = cropBoxHeightInPoints = -1; 226 List allPages = docCatalog.getAllPages(); 227 boolean gotMediaBox = false, gotCropBox = false; 228 for (Object pageObject : allPages) { 229 PDPage page = (PDPage) pageObject; 230 if (page != null) { 231 PDRectangle r = page.findMediaBox(); 232 if (r != null) { 233 mediaBoxWidthInPoints = r.getWidth(); 234 mediaBoxHeightInPoints = r.getHeight(); 235 gotMediaBox = true; 236 } 237 r = page.findCropBox(); 238 if (r != null) { 239 cropBoxWidthInPoints = r.getWidth(); 240 cropBoxHeightInPoints = r.getHeight(); 241 gotCropBox = true; 242 } 243 } 244 if (gotMediaBox && gotCropBox) { 245 break; 246 } 247 } 248 if (doXMP) { 249 xmp = null; 250 PDMetadata metadata = docCatalog.getMetadata(); 251 if (metadata != null) { 252 xmp = ""; 253 try (InputStream xmlInputStream = metadata.createInputStream(); // 254 BufferedReader reader = new BufferedReader(new InputStreamReader(xmlInputStream))) { 255 String line; 256 do { 257 line = reader.readLine(); 258 if (line != null) { 259 xmp += line + "\n"; 260 } 261 } while (line != null); 262 } 263 } 264 } 265 alreadyParsed = true; 266 } catch (IOException | BadSecurityHandlerException | CryptographyException e) { 267 throw new NuxeoException(e); 268 } 269 } 270 271 /** 272 * Return all and every parsed info in a String <code>HashMap</code>. 273 * <p> 274 * Possible values are: 275 * <ul> 276 * <li>File name</li> 277 * <li>File size</li> 278 * <li>PDF version</li> 279 * <li>Page count</li> 280 * <li>Page size</li> 281 * <li>Page width</li> 282 * <li>Page height</li> 283 * <li>Page layout</li> 284 * <li>Title</li> 285 * <li>Author</li> 286 * <li>Subject</li> 287 * <li>PDF producer</li> 288 * <li>Content creator</li> 289 * <li>Creation date</li> 290 */ 291 public HashMap<String, String> toHashMap() { 292 // Parse if needed 293 run(); 294 if (cachedMap == null) { 295 cachedMap = new LinkedHashMap<>(); 296 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 297 cachedMap.put("File name", fileName); 298 cachedMap.put("File size", String.valueOf(fileSize)); 299 cachedMap.put("PDF version", pdfVersion); 300 cachedMap.put("Page count", String.valueOf(numberOfPages)); 301 cachedMap.put("Page size", 302 String.format(Locale.ENGLISH, "%.1f x %.1f points", mediaBoxWidthInPoints, mediaBoxHeightInPoints)); 303 cachedMap.put("Page width", String.valueOf(mediaBoxWidthInPoints)); 304 cachedMap.put("Page height", String.valueOf(mediaBoxHeightInPoints)); 305 cachedMap.put("Page layout", pageLayout); 306 cachedMap.put("Title", title); 307 cachedMap.put("Author", author); 308 cachedMap.put("Subject", subject); 309 cachedMap.put("PDF producer", producer); 310 cachedMap.put("Content creator", contentCreator); 311 if (creationDate != null) { 312 cachedMap.put("Creation date", dateFormat.format(creationDate.getTime())); 313 } else { 314 cachedMap.put("Creation date", ""); 315 } 316 if (modificationDate != null) { 317 cachedMap.put("Modification date", dateFormat.format(modificationDate.getTime())); 318 } else { 319 cachedMap.put("Modification date", ""); 320 } 321 // "Others" 322 cachedMap.put("Encrypted", String.valueOf(isEncrypted)); 323 cachedMap.put("Keywords", keywords); 324 cachedMap.put("Media box width", String.valueOf(mediaBoxWidthInPoints)); 325 cachedMap.put("Media box height", String.valueOf(mediaBoxHeightInPoints)); 326 cachedMap.put("Crop box width", String.valueOf(cropBoxWidthInPoints)); 327 cachedMap.put("Crop box height", String.valueOf(cropBoxHeightInPoints)); 328 if(permissions != null) { 329 cachedMap.put("Can Print", String.valueOf(permissions.canPrint())); 330 cachedMap.put("Can Modify", String.valueOf(permissions.canModify())); 331 cachedMap.put("Can Extract", String.valueOf(permissions.canExtractContent())); 332 cachedMap.put("Can Modify Annotations", String.valueOf(permissions.canModifyAnnotations())); 333 cachedMap.put("Can Fill Forms", String.valueOf(permissions.canFillInForm())); 334 cachedMap.put("Can Extract for Accessibility", String.valueOf( 335 permissions.canExtractForAccessibility())); 336 cachedMap.put("Can Assemble", String.valueOf(permissions.canAssembleDocument())); 337 cachedMap.put("Can Print Degraded", String.valueOf(permissions.canPrintDegraded())); 338 } 339 } 340 return cachedMap; 341 } 342 343 /** 344 * The <code>inMapping</code> map is an HashMap where the key is the xpath of the destination field, and the value 345 * is the exact label of a PDF info as returned by <code>toHashMap()</code>. For example: 346 * <p> 347 * <code><pre> 348 * pdfinfo:title=Title 349 * pdfinfo:producer=PDF Producer 350 * pdfinfo:mediabox_width=Media box width 351 * ... 352 * </pre></code> 353 * <p> 354 * If <code>inSave</code> is false, inSession can be null. 355 * 356 * @param inDoc Input DocumentModel. 357 * @param inMapping Input Mapping. 358 * @param inSave Whether should save. 359 * @param inSession If is saving, should do it in this particular session. 360 */ 361 public DocumentModel toFields(DocumentModel inDoc, HashMap<String, String> inMapping, boolean inSave, 362 CoreSession inSession) { 363 // Parse if needed 364 run(); 365 Map<String, String> values = toHashMap(); 366 for (String inXPath : inMapping.keySet()) { 367 String value = values.get(inMapping.get(inXPath)); 368 inDoc.setPropertyValue(inXPath, value); 369 } 370 if (inSave) { 371 inDoc = inSession.saveDocument(inDoc); 372 } 373 return inDoc; 374 } 375 376 /** 377 * Wrapper for <code>toHashMap().toString()</code> 378 */ 379 @Override 380 public String toString() { 381 return toHashMap().toString(); 382 } 383 384 public int getNumberOfPages() { 385 return numberOfPages; 386 } 387 388 public float getMediaBoxWidthInPoints() { 389 return mediaBoxWidthInPoints; 390 } 391 392 public float getMediaBoxHeightInPoints() { 393 return mediaBoxHeightInPoints; 394 } 395 396 public float getCropBoxWidthInPoints() { 397 return cropBoxWidthInPoints; 398 } 399 400 public float getCropBoxHeightInPoints() { 401 return cropBoxHeightInPoints; 402 } 403 404 public long getFileSize() { 405 return fileSize; 406 } 407 408 public boolean isEncrypted() { 409 return isEncrypted; 410 } 411 412 public String getAuthor() { 413 return author; 414 } 415 416 public String getContentCreator() { 417 return contentCreator; 418 } 419 420 public String getFileName() { 421 return fileName; 422 } 423 424 public String getKeywords() { 425 return keywords; 426 } 427 428 public String getPageLayout() { 429 return pageLayout; 430 } 431 432 public String getPdfVersion() { 433 return pdfVersion; 434 } 435 436 public String getProducer() { 437 return producer; 438 } 439 440 public String getSubject() { 441 return subject; 442 } 443 444 public String getTitle() { 445 return title; 446 } 447 448 public String getXmp() { 449 return xmp; 450 } 451 452 public Calendar getCreationDate() { 453 return creationDate; 454 } 455 456 public Calendar getModificationDate() { 457 return modificationDate; 458 } 459 460 public AccessPermission getPermissions() { 461 return permissions; 462 } 463 464}