/*
 * (C) Copyright 2016-2018 Nuxeo (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Thibaud Arguillere
 *     Miguel Nixo
 */
package org.nuxeo.ecm.platform.pdf;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import javax.imageio.ImageIO;

import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.multipdf.PageExtractor;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.nuxeo.ecm.automation.core.util.BlobList;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.NuxeoException;
import org.nuxeo.ecm.core.api.impl.blob.FileBlob;
import org.nuxeo.runtime.api.Framework;

/**
 * Extract pages from a PDF.
 *
 * @since 8.10
 */
public class PDFPageExtractor {

    /** The source PDF; set once by the constructors and never reassigned. */
    private final Blob pdfBlob;

    /** Password handed to {@code PDFUtils.load}; stays <code>null</code> unless {@link #setPassword} is called. */
    private String password;

    /**
     * Constructor with a <code>Blob</code>.
     *
     * @param inBlob Input PDF blob.
     */
    public PDFPageExtractor(Blob inBlob) {
        pdfBlob = inBlob;
    }

    /**
     * Constructor with a <code>DocumentModel</code>. Default value for <code>inXPath</code> (if passed
     * <code>null</code> or ""), is <code>file:content</code>.
     *
     * @param inDoc Input DocumentModel.
     * @param inXPath Input XPath.
     */
    public PDFPageExtractor(DocumentModel inDoc, String inXPath) {
        if (StringUtils.isBlank(inXPath)) {
            inXPath = "file:content";
        }
        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
    }

    /**
     * Extracts pages <code>inStartPage</code> to <code>inEndPage</code> (inclusive) with a default file name and
     * without setting the title, subject or author.
     *
     * @param inStartPage Number of first page to be included.
     * @param inEndPage Number of the last page to be included.
     * @return the blob holding the extracted pages
     */
    public Blob extract(int inStartPage, int inEndPage) {
        return extract(inStartPage, inEndPage, null, null, null, null);
    }

    /**
     * Returns the base name used to build result file names: <code>"extracted"</code> when the blob has no file name,
     * otherwise the blob's file name cut at the last (case-insensitive) occurrence of <code>".pdf"</code>.
     */
    private String getFileName(Blob blob) {
        String originalName = blob.getFilename();
        if (StringUtils.isBlank(originalName)) {
            return "extracted";
        }
        int pos = originalName.toLowerCase().lastIndexOf(".pdf");
        // pos == 0 means the whole name is ".pdf": keep it rather than returning an empty base name.
        if (pos > 0) {
            originalName = originalName.substring(0, pos);
        }
        return originalName;
    }

    /**
     * Return a Blob built from page <code>inStartPage</code> to <code>inEndPage</code> (inclusive).
     * <p>
     * If <code>inEndPage</code> is greater than the number of pages in the source document, it will go to the end of
     * the document. If <code>inStartPage</code> is less than 1, it'll start with page 1. If <code>inStartPage</code> is
     * greater than <code>inEndPage</code> or greater than the number of pages in the source document, a blank document
     * will be returned.
     * <p>
     * If fileName is null or "", it is set to the original name + the page range: mydoc.pdf and pages 10-75 =&gt;
     * mydoc-10-75.pdf.
     * <p>
     * The mimetype is always set to "application/pdf".
     * <p>
     * Can set the title, subject and author of the resulting PDF. <b>Notice</b>: If the value is null or "", it is just
     * ignored.
     *
     * @param inStartPage Number of first page to be included.
     * @param inEndPage Number of the last page to be included.
     * @param inFileName Name of the resulting PDF.
     * @param inTitle Title of the resulting PDF.
     * @param inSubject Subject of the resulting PDF.
     * @param inAuthor Author of the resulting PDF.
     * @return FileBlob
     * @throws NuxeoException if loading, extracting or saving the PDF fails with an <code>IOException</code>
     */
    public Blob extract(int inStartPage, int inEndPage, String inFileName, String inTitle, String inSubject,
            String inAuthor) throws NuxeoException {
        // Both documents are managed resources so the extracted document is released even when setInfos or
        // saveInTempFile throws. Resources close in reverse order: the extracted document first, then the source.
        try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password);
                PDDocument extracted = new PageExtractor(pdfDoc, inStartPage, inEndPage).extract()) {
            PDFUtils.setInfos(extracted, inTitle, inSubject, inAuthor);
            Blob result = PDFUtils.saveInTempFile(extracted);
            result.setMimeType("application/pdf");
            if (StringUtils.isBlank(inFileName)) {
                inFileName = getFileName(pdfBlob) + "-" + inStartPage + "-" + inEndPage + ".pdf";
            }
            result.setFilename(inFileName);
            return result;
        } catch (IOException e) {
            throw new NuxeoException("Failed to extract the pages", e);
        }
    }

    /**
     * Renders every page of the PDF as a 300 DPI RGB PNG image and returns the images as blobs, in page order.
     * <p>
     * Each blob is named <code>inFileName + "-" + pageNumber + ".png"</code>, with page numbers starting at 1. If
     * <code>inFileName</code> is null or "", the original file name is used instead (the name derived by
     * <code>getFileName</code> followed by ".pdf").
     *
     * @param inFileName Base name for the resulting images.
     * @return the list of image blobs, one per page
     * @throws NuxeoException if loading the PDF, rendering a page or writing an image fails with an
     *             <code>IOException</code>
     */
    public BlobList getPagesAsImages(String inFileName) throws NuxeoException {
        ImageIO.scanForPlugins();
        BlobList results = new BlobList();
        // Use file name parameter if passed, otherwise use original file name.
        if (StringUtils.isBlank(inFileName)) {
            inFileName = getFileName(pdfBlob) + ".pdf";
        }
        try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) {
            PDFRenderer pdfRenderer = new PDFRenderer(pdfDoc);
            int pageno = 0;
            // The page objects only drive the iteration: the renderer is addressed by 0-based page index.
            for (PDPage page : pdfDoc.getDocumentCatalog().getPages()) {
                pageno++;
                String resultFileName = inFileName + "-" + pageno;
                // Convert the page to PNG.
                BufferedImage bim = pdfRenderer.renderImageWithDPI(pageno - 1, 300, ImageType.RGB);
                File resultFile = Framework.createTempFile(resultFileName, ".png");
                // Close the stream before the file is handed over as a blob, so that the image data is
                // completely written and no file handle is leaked per page.
                try (FileOutputStream resultFileStream = new FileOutputStream(resultFile)) {
                    ImageIOUtil.writeImage(bim, "png", resultFileStream, 300);
                }
                // Convert the PNG to a Nuxeo Blob.
                FileBlob result = new FileBlob(resultFile);
                result.setFilename(resultFileName + ".png");
                // NOTE(review): "picture/png" is not a registered media type ("image/png" is); kept unchanged
                // because callers may compare against this value -- confirm before correcting.
                result.setMimeType("picture/png");
                // Add to BlobList.
                results.add(result);
                // Presumably lets the framework delete the temp file once the blob is no longer referenced --
                // TODO confirm against Framework.trackFile.
                Framework.trackFile(resultFile, result);
            }
        } catch (IOException e) {
            throw new NuxeoException("Failed to extract the pages", e);
        }
        return results;
    }

    /**
     * Sets the password used to open the PDF.
     *
     * @param password Password of the PDF.
     */
    public void setPassword(String password) {
        this.password = password;
    }

}