Source code

001/*
002 * (C) Copyright 2016-2018 Nuxeo (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.awt.image.BufferedImage;
023import java.io.File;
024import java.io.FileOutputStream;
025import java.io.IOException;
026
027import javax.imageio.ImageIO;
028
029import org.apache.commons.lang3.StringUtils;
030import org.apache.pdfbox.multipdf.PageExtractor;
031import org.apache.pdfbox.pdmodel.PDDocument;
032import org.apache.pdfbox.pdmodel.PDPage;
033import org.apache.pdfbox.rendering.ImageType;
034import org.apache.pdfbox.rendering.PDFRenderer;
035import org.apache.pdfbox.tools.imageio.ImageIOUtil;
036import org.nuxeo.ecm.automation.core.util.BlobList;
037import org.nuxeo.ecm.core.api.Blob;
038import org.nuxeo.ecm.core.api.DocumentModel;
039import org.nuxeo.ecm.core.api.NuxeoException;
040import org.nuxeo.ecm.core.api.impl.blob.FileBlob;
041import org.nuxeo.runtime.api.Framework;
042
043/**
044 * Extract pages from a PDF.
045 *
046 * @since 8.10
047 */
048public class PDFPageExtractor {
049
050    private Blob pdfBlob;
051
052    private String password;
053
054    public PDFPageExtractor(Blob inBlob) {
055        pdfBlob = inBlob;
056    }
057
058    /**
059     * Constructor with a <code>DocumentModel</code>. Default value for <code>inXPath</code> (if passed
060     * <code>null</code> or ""), is <code>file:content</code>.
061     *
062     * @param inDoc Input DocumentModel.
063     * @param inXPath Input XPath.
064     */
065    public PDFPageExtractor(DocumentModel inDoc, String inXPath) {
066        if (StringUtils.isBlank(inXPath)) {
067            inXPath = "file:content";
068        }
069        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
070    }
071
072    public Blob extract(int inStartPage, int inEndPage) {
073        return extract(inStartPage, inEndPage, null, null, null, null);
074    }
075
076    private String getFileName(Blob blob) {
077        String originalName = blob.getFilename();
078        if (StringUtils.isBlank(originalName)) {
079            return "extracted";
080        } else {
081            int pos = originalName.toLowerCase().lastIndexOf(".pdf");
082            if (pos > 0) {
083                originalName = originalName.substring(0, pos);
084            }
085            return originalName;
086        }
087    }
088
089    /**
090     * Return a Blob built from page <code>inStartPage</code> to <code>inEndPage</code> (inclusive).
091     * <p>
092     * If <code>inEndPage</code> is greater than the number of pages in the source document, it will go to the end of
093     * the document. If <code>inStartPage</code> is less than 1, it'll start with page 1. If <code>inStartPage</code> is
094     * greater than <code>inEndPage</code> or greater than the number of pages in the source document, a blank document
095     * will be returned.
096     * <p>
097     * If fileName is null or "", if is set to the original name + the page range: mydoc.pdf and pages 10-75 +@gt;
098     * mydoc-10-75.pdf.
099     * <p>
100     * The mimetype is always set to "application/pdf".
101     * <p>
102     * Can set the title, subject and author of the resulting PDF. <b>Notice</b>: If the value is null or "", it is just
103     * ignored.
104     *
105     * @param inStartPage Number of first page to be included.
106     * @param inEndPage Number of the last page to be included.
107     * @param inFileName Name of the resulting PDF.
108     * @param inTitle Title of the resulting PDF.
109     * @param inSubject Subject of the resulting PDF.
110     * @param inAuthor Author of the resulting PDF.
111     * @return FileBlob
112     */
113    public Blob extract(int inStartPage, int inEndPage, String inFileName, String inTitle, String inSubject,
114                        String inAuthor) throws NuxeoException {
115        Blob result;
116        PDDocument extracted;
117        try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) {
118            PageExtractor pe = new PageExtractor(pdfDoc, inStartPage, inEndPage);
119            extracted = pe.extract();
120            PDFUtils.setInfos(extracted, inTitle, inSubject, inAuthor);
121            result = PDFUtils.saveInTempFile(extracted);
122            result.setMimeType("application/pdf");
123            if (StringUtils.isBlank(inFileName)) {
124                inFileName = getFileName(pdfBlob) + "-" + inStartPage + "-" + inEndPage + ".pdf";
125            }
126            result.setFilename(inFileName);
127            extracted.close();
128        } catch (IOException e) {
129            throw new NuxeoException("Failed to extract the pages", e);
130        }
131        return result;
132    }
133
134    public BlobList getPagesAsImages(String inFileName) throws NuxeoException {
135        ImageIO.scanForPlugins();
136        BlobList results = new BlobList();
137        String resultFileName;
138        // Use file name parameter if passed, otherwise use original file name.
139        if (StringUtils.isBlank(inFileName)) {
140            inFileName = getFileName(pdfBlob) + ".pdf";
141        }
142        try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) {
143            // Get all PDF pages.
144            // Convert each page to PNG.
145            PDFRenderer pdfRenderer = new PDFRenderer(pdfDoc);
146            int pageno = 0;
147            for (PDPage page : pdfDoc.getDocumentCatalog().getPages()) {
148                pageno++;
149                resultFileName = inFileName + "-" + pageno;
150                BufferedImage bim = pdfRenderer.renderImageWithDPI(pageno - 1, 300, ImageType.RGB);
151                File resultFile = Framework.createTempFile(resultFileName, ".png");
152                FileOutputStream resultFileStream = new FileOutputStream(resultFile);
153                ImageIOUtil.writeImage(bim, "png", resultFileStream, 300);
154                // Convert each PNG to Nuxeo Blob.
155                FileBlob result = new FileBlob(resultFile);
156                result.setFilename(resultFileName + ".png");
157                result.setMimeType("picture/png");
158                // Add to BlobList.
159                results.add(result);
160                Framework.trackFile(resultFile, result);
161            }
162            pdfDoc.close();
163        } catch (IOException e) {
164            throw new NuxeoException("Failed to extract the pages", e);
165        }
166        return results;
167    }
168
169    public void setPassword(String password) {
170        this.password = password;
171    }
172
173}