001/* 002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Thibaud Arguillere 018 * Miguel Nixo 019 */ 020package org.nuxeo.ecm.platform.pdf; 021 022import org.apache.commons.lang.StringUtils; 023import org.apache.pdfbox.exceptions.COSVisitorException; 024import org.apache.pdfbox.pdmodel.PDDocument; 025import org.apache.pdfbox.pdmodel.PDPage; 026import org.apache.pdfbox.util.ImageIOUtil; 027import org.apache.pdfbox.util.PageExtractor; 028import org.nuxeo.ecm.automation.core.util.BlobList; 029import org.nuxeo.ecm.core.api.Blob; 030import org.nuxeo.ecm.core.api.DocumentModel; 031import org.nuxeo.ecm.core.api.NuxeoException; 032import org.nuxeo.ecm.core.api.impl.blob.FileBlob; 033import org.nuxeo.runtime.api.Framework; 034import javax.imageio.ImageIO; 035import java.awt.image.BufferedImage; 036import java.io.File; 037import java.io.FileOutputStream; 038import java.io.IOException; 039import java.util.List; 040 041/** 042 * Extract pages from a PDF. 043 * 044 * @since 8.10 045 */ 046public class PDFPageExtractor { 047 048 private Blob pdfBlob; 049 050 private String password; 051 052 public PDFPageExtractor(Blob inBlob) { 053 pdfBlob = inBlob; 054 } 055 056 /** 057 * Constructor with a <code>DocumentModel</code>. Default value for <code>inXPath</code> (if passed 058 * <code>null</code> or ""), is <code>file:content</code>. 059 * 060 * @param inDoc Input DocumentModel. 061 * @param inXPath Input XPath. 062 */ 063 public PDFPageExtractor(DocumentModel inDoc, String inXPath) { 064 if (StringUtils.isBlank(inXPath)) { 065 inXPath = "file:content"; 066 } 067 pdfBlob = (Blob) inDoc.getPropertyValue(inXPath); 068 } 069 070 public Blob extract(int inStartPage, int inEndPage) { 071 return extract(inStartPage, inEndPage, null, null, null, null); 072 } 073 074 private String getFileName(Blob blob) { 075 String originalName = blob.getFilename(); 076 if (StringUtils.isBlank(originalName)) { 077 return "extracted"; 078 } else { 079 int pos = originalName.toLowerCase().lastIndexOf(".pdf"); 080 if (pos > 0) { 081 originalName = originalName.substring(0, pos); 082 } 083 return originalName; 084 } 085 } 086 087 /** 088 * Return a Blob built from page <code>inStartPage</code> to <code>inEndPage</code> (inclusive). 089 * <p> 090 * If <code>inEndPage</code> is greater than the number of pages in the source document, it will go to the end of 091 * the document. If <code>inStartPage</code> is less than 1, it'll start with page 1. If <code>inStartPage</code> is 092 * greater than <code>inEndPage</code> or greater than the number of pages in the source document, a blank document 093 * will be returned. 094 * <p> 095 * If fileName is null or "", if is set to the original name + the page range: mydoc.pdf and pages 10-75 +> 096 * mydoc-10-75.pdf. 097 * <p> 098 * The mimetype is always set to "application/pdf". 099 * <p> 100 * Can set the title, subject and author of the resulting PDF. <b>Notice</b>: If the value is null or "", it is just 101 * ignored. 102 * 103 * @param inStartPage Number of first page to be included. 104 * @param inEndPage Number of the last page to be included. 105 * @param inFileName Name of the resulting PDF. 106 * @param inTitle Title of the resulting PDF. 107 * @param inSubject Subject of the resulting PDF. 108 * @param inAuthor Author of the resulting PDF. 109 * @return FileBlob 110 */ 111 public Blob extract(int inStartPage, int inEndPage, String inFileName, String inTitle, String inSubject, 112 String inAuthor) throws NuxeoException { 113 Blob result; 114 PDDocument extracted = null; 115 try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) { 116 PageExtractor pe = new PageExtractor(pdfDoc, inStartPage, inEndPage); 117 extracted = pe.extract(); 118 PDFUtils.setInfos(extracted, inTitle, inSubject, inAuthor); 119 result = PDFUtils.saveInTempFile(extracted); 120 result.setMimeType("application/pdf"); 121 if (StringUtils.isBlank(inFileName)) { 122 inFileName = getFileName(pdfBlob) + "-" + inStartPage + "-" + inEndPage + ".pdf"; 123 } 124 result.setFilename(inFileName); 125 extracted.close(); 126 } catch (IOException | COSVisitorException e) { 127 throw new NuxeoException("Failed to extract the pages", e); 128 } 129 return result; 130 } 131 132 public BlobList getPagesAsImages(String inFileName) throws NuxeoException { 133 ImageIO.scanForPlugins(); 134 BlobList results = new BlobList(); 135 String resultFileName; 136 // Use file name parameter if passed, otherwise use original file name. 137 if (StringUtils.isBlank(inFileName)) { 138 inFileName = getFileName(pdfBlob) + ".pdf"; 139 } 140 try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) { 141 // Get all PDF pages. 142 List pages = pdfDoc.getDocumentCatalog().getAllPages(); 143 // Convert each page to PNG. 144 for (Object pageObject : pages) { 145 PDPage page = (PDPage) pageObject; 146 resultFileName = inFileName + "-" + (pages.indexOf(page) + 1); 147 BufferedImage bim = page.convertToImage(BufferedImage.TYPE_INT_RGB, 300); 148 File resultFile = Framework.createTempFile(resultFileName, ".png"); 149 FileOutputStream resultFileStream = new FileOutputStream(resultFile); 150 ImageIOUtil.writeImage(bim, "png", resultFileStream, 300); 151 // Convert each PNG to Nuxeo Blob. 152 FileBlob result = new FileBlob(resultFile); 153 result.setFilename(resultFileName + ".png"); 154 result.setMimeType("picture/png"); 155 // Add to BlobList. 156 results.add(result); 157 Framework.trackFile(resultFile, result); 158 } 159 pdfDoc.close(); 160 } catch (IOException e) { 161 throw new NuxeoException("Failed to extract the pages", e); 162 } 163 return results; 164 } 165 166 public void setPassword(String password) { 167 this.password = password; 168 } 169 170}