001/* 002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Thibaud Arguillere 018 * Miguel Nixo 019 */ 020package org.nuxeo.ecm.platform.pdf; 021 022import java.io.BufferedReader; 023import java.io.File; 024import java.io.IOException; 025import java.io.InputStream; 026import java.io.InputStreamReader; 027import java.text.SimpleDateFormat; 028import java.util.Calendar; 029import java.util.HashMap; 030import java.util.LinkedHashMap; 031import java.util.Locale; 032import java.util.Map; 033 034import org.apache.pdfbox.pdmodel.PDDocument; 035import org.apache.pdfbox.pdmodel.PDDocumentCatalog; 036import org.apache.pdfbox.pdmodel.PDDocumentInformation; 037import org.apache.pdfbox.pdmodel.PDPage; 038import org.apache.pdfbox.pdmodel.common.PDMetadata; 039import org.apache.pdfbox.pdmodel.common.PDRectangle; 040import org.apache.pdfbox.pdmodel.encryption.AccessPermission; 041import org.nuxeo.ecm.core.api.Blob; 042import org.nuxeo.ecm.core.api.CoreSession; 043import org.nuxeo.ecm.core.api.DocumentModel; 044import org.nuxeo.ecm.core.api.NuxeoException; 045 046/** 047 * The class will parse the info embedded in a PDF, and return them either globally (<code>toHashMap()</code> or 048 * <code>toString()</code>) or via individual getters. 049 * <p> 050 * The PDF is parsed only at first call to <code>run()</code>. Values are cached during first call. 051 * <p> 052 * About page sizes, see <a href="http://www.prepressure.com/pdf/basics/page-boxes">PDF page boxes</a> for details. 053 * Here, we get the info from the first page only. The dimensions are in points. Divide by 72 to get it in inches. 054 * 055 * @since 8.10 056 */ 057public class PDFInfo { 058 059 private Blob pdfBlob; 060 061 private int numberOfPages = -1; 062 063 private float mediaBoxWidthInPoints = 0.0f; 064 065 private float mediaBoxHeightInPoints = 0.0f; 066 067 private float cropBoxWidthInPoints = 0.0f; 068 069 private float cropBoxHeightInPoints = 0.0f; 070 071 private long fileSize = -1; 072 073 private boolean isEncrypted; 074 075 private boolean doXMP = false; 076 077 private boolean alreadyParsed = false; 078 079 private String password; 080 081 private String author = ""; 082 083 private String contentCreator = ""; 084 085 private String fileName = ""; 086 087 private String keywords = ""; 088 089 private String pageLayout = ""; 090 091 private String pdfVersion = ""; 092 093 private String producer = ""; 094 095 private String subject = ""; 096 097 private String title; 098 099 private String xmp; 100 101 private Calendar creationDate; 102 103 private Calendar modificationDate; 104 105 private AccessPermission permissions; 106 107 private LinkedHashMap<String, String> cachedMap; 108 109 /** 110 * Constructor with a Blob. 111 * 112 * @param inBlob Input blob. 113 */ 114 public PDFInfo(Blob inBlob) { 115 this(inBlob, null); 116 } 117 118 /** 119 * Constructor for Blob + encrypted PDF. 120 * 121 * @param inBlob Input blob. 122 * @param inPassword If the PDF is encrypted. 123 */ 124 public PDFInfo(Blob inBlob, String inPassword) { 125 pdfBlob = inBlob; 126 password = inPassword; 127 title = ""; 128 } 129 130 /** 131 * Constructor with a DocumentModel. Uses the default <code>file:content</code> xpath to get the blob from the 132 * document. 133 * 134 * @param inDoc Input DocumentModel. 135 */ 136 public PDFInfo(DocumentModel inDoc) { 137 this(inDoc, null, null); 138 } 139 140 /** 141 * Constructor for DocumentModel + encrypted PDF 142 * <p> 143 * If {@code inXPath} is {@code null} or {@code ""}, it is set to the default {@code file:content} value. 144 * 145 * @param inDoc Input DocumentModel. 146 * @param inXPath Input XPath. 147 * @param inPassword If the PDF is encrypted. 148 */ 149 public PDFInfo(DocumentModel inDoc, String inXPath, String inPassword) { 150 if (inXPath == null || inXPath.isEmpty()) { 151 inXPath = "file:content"; 152 } 153 pdfBlob = (Blob) inDoc.getPropertyValue(inXPath); 154 password = inPassword; 155 title = ""; 156 } 157 158 /** 159 * If set to true, parsing will extract PDF. 160 * <p> 161 * The value cannot be modified if <code>run()</code> already has been called. 162 * 163 * @param inValue true to extract XMP. 164 */ 165 public void setParseWithXMP(boolean inValue) { 166 if (alreadyParsed && doXMP != inValue) { 167 throw new NuxeoException("Value of 'doXML' cannot be modified after the blob has been already parsed."); 168 } 169 doXMP = inValue; 170 } 171 172 private String checkNotNull(String inValue) { 173 return inValue == null ? "" : inValue; 174 } 175 176 /** 177 * After building the object with the correct constructor, and after possibly having set some parsing property 178 * (<code>setParseWithXMP()</code>, for example), this method will extract the information from the PDF. 179 * <p> 180 * After extraction, the info is available through getters: Either all of them (<code>toHashMap()</code> or 181 * <code>toString()</code>) or individual info (see all getters). 182 */ 183 public void run() throws NuxeoException { 184 // In case the caller calls several time the run() method 185 if (alreadyParsed) { 186 return; 187 } 188 fileName = pdfBlob.getFilename(); 189 File pdfFile = pdfBlob.getFile(); 190 fileSize = (pdfFile == null) ? -1 : pdfFile.length(); 191 try (PDDocument pdfDoc = PDDocument.load(pdfBlob.getStream(), password)) { 192 isEncrypted = pdfDoc.isEncrypted(); 193 numberOfPages = pdfDoc.getNumberOfPages(); 194 PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog(); 195 pageLayout = docCatalog.getPageLayout().stringValue(); 196 pdfVersion = String.valueOf(pdfDoc.getDocument().getVersion()); 197 PDDocumentInformation docInfo = pdfDoc.getDocumentInformation(); 198 author = checkNotNull(docInfo.getAuthor()); 199 contentCreator = checkNotNull(docInfo.getCreator()); 200 keywords = checkNotNull(docInfo.getKeywords()); 201 creationDate = docInfo.getCreationDate(); 202 modificationDate = docInfo.getModificationDate(); 203 producer = checkNotNull(docInfo.getProducer()); 204 subject = checkNotNull(docInfo.getSubject()); 205 title = checkNotNull(docInfo.getTitle()); 206 permissions = pdfDoc.getCurrentAccessPermission(); 207 // Getting dimension is a bit tricky 208 mediaBoxWidthInPoints = mediaBoxHeightInPoints = cropBoxWidthInPoints = cropBoxHeightInPoints = -1; 209 boolean gotMediaBox = false, gotCropBox = false; 210 for (PDPage page : docCatalog.getPages()) { 211 if (page != null) { 212 PDRectangle r = page.getMediaBox(); 213 if (r != null) { 214 mediaBoxWidthInPoints = r.getWidth(); 215 mediaBoxHeightInPoints = r.getHeight(); 216 gotMediaBox = true; 217 } 218 r = page.getCropBox(); 219 if (r != null) { 220 cropBoxWidthInPoints = r.getWidth(); 221 cropBoxHeightInPoints = r.getHeight(); 222 gotCropBox = true; 223 } 224 } 225 if (gotMediaBox && gotCropBox) { 226 break; 227 } 228 } 229 if (doXMP) { 230 xmp = null; 231 PDMetadata metadata = docCatalog.getMetadata(); 232 if (metadata != null) { 233 xmp = ""; 234 try (InputStream xmlInputStream = metadata.createInputStream(); // 235 BufferedReader reader = new BufferedReader(new InputStreamReader(xmlInputStream))) { 236 String line; 237 do { 238 line = reader.readLine(); 239 if (line != null) { 240 xmp += line + "\n"; 241 } 242 } while (line != null); 243 } 244 } 245 } 246 alreadyParsed = true; 247 } catch (IOException e) { 248 throw new NuxeoException(e); 249 } 250 } 251 252 /** 253 * Return all and every parsed info in a String <code>HashMap</code>. 254 * <p> 255 * Possible values are: 256 * <ul> 257 * <li>File name</li> 258 * <li>File size</li> 259 * <li>PDF version</li> 260 * <li>Page count</li> 261 * <li>Page size</li> 262 * <li>Page width</li> 263 * <li>Page height</li> 264 * <li>Page layout</li> 265 * <li>Title</li> 266 * <li>Author</li> 267 * <li>Subject</li> 268 * <li>PDF producer</li> 269 * <li>Content creator</li> 270 * <li>Creation date</li> 271 * </ul> 272 */ 273 public HashMap<String, String> toHashMap() { 274 // Parse if needed 275 run(); 276 if (cachedMap == null) { 277 cachedMap = new LinkedHashMap<>(); 278 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 279 cachedMap.put("File name", fileName); 280 cachedMap.put("File size", String.valueOf(fileSize)); 281 cachedMap.put("PDF version", pdfVersion); 282 cachedMap.put("Page count", String.valueOf(numberOfPages)); 283 cachedMap.put("Page size", 284 String.format(Locale.ENGLISH, "%.1f x %.1f points", mediaBoxWidthInPoints, mediaBoxHeightInPoints)); 285 cachedMap.put("Page width", String.valueOf(mediaBoxWidthInPoints)); 286 cachedMap.put("Page height", String.valueOf(mediaBoxHeightInPoints)); 287 cachedMap.put("Page layout", pageLayout); 288 cachedMap.put("Title", title); 289 cachedMap.put("Author", author); 290 cachedMap.put("Subject", subject); 291 cachedMap.put("PDF producer", producer); 292 cachedMap.put("Content creator", contentCreator); 293 if (creationDate != null) { 294 cachedMap.put("Creation date", dateFormat.format(creationDate.getTime())); 295 } else { 296 cachedMap.put("Creation date", ""); 297 } 298 if (modificationDate != null) { 299 cachedMap.put("Modification date", dateFormat.format(modificationDate.getTime())); 300 } else { 301 cachedMap.put("Modification date", ""); 302 } 303 // "Others" 304 cachedMap.put("Encrypted", String.valueOf(isEncrypted)); 305 cachedMap.put("Keywords", keywords); 306 cachedMap.put("Media box width", String.valueOf(mediaBoxWidthInPoints)); 307 cachedMap.put("Media box height", String.valueOf(mediaBoxHeightInPoints)); 308 cachedMap.put("Crop box width", String.valueOf(cropBoxWidthInPoints)); 309 cachedMap.put("Crop box height", String.valueOf(cropBoxHeightInPoints)); 310 if(permissions != null) { 311 cachedMap.put("Can Print", String.valueOf(permissions.canPrint())); 312 cachedMap.put("Can Modify", String.valueOf(permissions.canModify())); 313 cachedMap.put("Can Extract", String.valueOf(permissions.canExtractContent())); 314 cachedMap.put("Can Modify Annotations", String.valueOf(permissions.canModifyAnnotations())); 315 cachedMap.put("Can Fill Forms", String.valueOf(permissions.canFillInForm())); 316 cachedMap.put("Can Extract for Accessibility", String.valueOf( 317 permissions.canExtractForAccessibility())); 318 cachedMap.put("Can Assemble", String.valueOf(permissions.canAssembleDocument())); 319 cachedMap.put("Can Print Degraded", String.valueOf(permissions.canPrintDegraded())); 320 } 321 } 322 return cachedMap; 323 } 324 325 /** 326 * The <code>inMapping</code> map is an HashMap where the key is the xpath of the destination field, and the value 327 * is the exact label of a PDF info as returned by <code>toHashMap()</code>. For example: 328 * <p> 329 * <pre><code> 330 * pdfinfo:title=Title 331 * pdfinfo:producer=PDF Producer 332 * pdfinfo:mediabox_width=Media box width 333 * ... 334 * </code></pre> 335 * <p> 336 * If <code>inSave</code> is false, inSession can be null. 337 * 338 * @param inDoc Input DocumentModel. 339 * @param inMapping Input Mapping. 340 * @param inSave Whether should save. 341 * @param inSession If is saving, should do it in this particular session. 342 */ 343 public DocumentModel toFields(DocumentModel inDoc, HashMap<String, String> inMapping, boolean inSave, 344 CoreSession inSession) { 345 // Parse if needed 346 run(); 347 Map<String, String> values = toHashMap(); 348 for (String inXPath : inMapping.keySet()) { 349 String value = values.get(inMapping.get(inXPath)); 350 inDoc.setPropertyValue(inXPath, value); 351 } 352 if (inSave) { 353 inDoc = inSession.saveDocument(inDoc); 354 } 355 return inDoc; 356 } 357 358 /** 359 * Wrapper for <code>toHashMap().toString()</code> 360 */ 361 @Override 362 public String toString() { 363 return toHashMap().toString(); 364 } 365 366 public int getNumberOfPages() { 367 return numberOfPages; 368 } 369 370 public float getMediaBoxWidthInPoints() { 371 return mediaBoxWidthInPoints; 372 } 373 374 public float getMediaBoxHeightInPoints() { 375 return mediaBoxHeightInPoints; 376 } 377 378 public float getCropBoxWidthInPoints() { 379 return cropBoxWidthInPoints; 380 } 381 382 public float getCropBoxHeightInPoints() { 383 return cropBoxHeightInPoints; 384 } 385 386 public long getFileSize() { 387 return fileSize; 388 } 389 390 public boolean isEncrypted() { 391 return isEncrypted; 392 } 393 394 public String getAuthor() { 395 return author; 396 } 397 398 public String getContentCreator() { 399 return contentCreator; 400 } 401 402 public String getFileName() { 403 return fileName; 404 } 405 406 public String getKeywords() { 407 return keywords; 408 } 409 410 public String getPageLayout() { 411 return pageLayout; 412 } 413 414 public String getPdfVersion() { 415 return pdfVersion; 416 } 417 418 public String getProducer() { 419 return producer; 420 } 421 422 public String getSubject() { 423 return subject; 424 } 425 426 public String getTitle() { 427 return title; 428 } 429 430 public String getXmp() { 431 return xmp; 432 } 433 434 public Calendar getCreationDate() { 435 return creationDate; 436 } 437 438 public Calendar getModificationDate() { 439 return modificationDate; 440 } 441 442 public AccessPermission getPermissions() { 443 return permissions; 444 } 445 446}