001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import org.apache.commons.lang.StringUtils;
023import org.apache.pdfbox.exceptions.COSVisitorException;
024import org.apache.pdfbox.pdmodel.PDDocument;
025import org.apache.pdfbox.pdmodel.PDPage;
026import org.apache.pdfbox.util.ImageIOUtil;
027import org.apache.pdfbox.util.PageExtractor;
028import org.nuxeo.ecm.automation.core.util.BlobList;
029import org.nuxeo.ecm.core.api.Blob;
030import org.nuxeo.ecm.core.api.DocumentModel;
031import org.nuxeo.ecm.core.api.NuxeoException;
032import org.nuxeo.ecm.core.api.impl.blob.FileBlob;
033import org.nuxeo.runtime.api.Framework;
034import javax.imageio.ImageIO;
035import java.awt.image.BufferedImage;
036import java.io.File;
037import java.io.FileOutputStream;
038import java.io.IOException;
039import java.util.List;
040
041/**
042 * Extract pages from a PDF.
043 *
044 * @since 8.10
045 */
046public class PDFPageExtractor {
047
048    private Blob pdfBlob;
049
050    private String password;
051
052    public PDFPageExtractor(Blob inBlob) {
053        pdfBlob = inBlob;
054    }
055
056    /**
057     * Constructor with a <code>DocumentModel</code>. Default value for <code>inXPath</code> (if passed
058     * <code>null</code> or ""), is <code>file:content</code>.
059     *
060     * @param inDoc Input DocumentModel.
061     * @param inXPath Input XPath.
062     */
063    public PDFPageExtractor(DocumentModel inDoc, String inXPath) {
064        if (StringUtils.isBlank(inXPath)) {
065            inXPath = "file:content";
066        }
067        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
068    }
069
070    public Blob extract(int inStartPage, int inEndPage) {
071        return extract(inStartPage, inEndPage, null, null, null, null);
072    }
073
074    private String getFileName(Blob blob) {
075        String originalName = blob.getFilename();
076        if (StringUtils.isBlank(originalName)) {
077            return "extracted";
078        } else {
079            int pos = originalName.toLowerCase().lastIndexOf(".pdf");
080            if (pos > 0) {
081                originalName = originalName.substring(0, pos);
082            }
083            return originalName;
084        }
085    }
086
087    /**
088     * Return a Blob built from page <code>inStartPage</code> to <code>inEndPage</code> (inclusive).
089     * <p>
090     * If <code>inEndPage</code> is greater than the number of pages in the source document, it will go to the end of
091     * the document. If <code>inStartPage</code> is less than 1, it'll start with page 1. If <code>inStartPage</code> is
092     * greater than <code>inEndPage</code> or greater than the number of pages in the source document, a blank document
093     * will be returned.
094     * <p>
095     * If fileName is null or "", if is set to the original name + the page range: mydoc.pdf and pages 10-75 +>
096     * mydoc-10-75.pdf.
097     * <p>
098     * The mimetype is always set to "application/pdf".
099     * <p>
100     * Can set the title, subject and author of the resulting PDF. <b>Notice</b>: If the value is null or "", it is just
101     * ignored.
102     *
103     * @param inStartPage Number of first page to be included.
104     * @param inEndPage Number of the last page to be included.
105     * @param inFileName Name of the resulting PDF.
106     * @param inTitle Title of the resulting PDF.
107     * @param inSubject Subject of the resulting PDF.
108     * @param inAuthor Author of the resulting PDF.
109     * @return FileBlob
110     */
111    public Blob extract(int inStartPage, int inEndPage, String inFileName, String inTitle, String inSubject,
112                        String inAuthor) throws NuxeoException {
113        Blob result;
114        PDDocument extracted = null;
115        try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) {
116            PageExtractor pe = new PageExtractor(pdfDoc, inStartPage, inEndPage);
117            extracted = pe.extract();
118            PDFUtils.setInfos(extracted, inTitle, inSubject, inAuthor);
119            result = PDFUtils.saveInTempFile(extracted);
120            result.setMimeType("application/pdf");
121            if (StringUtils.isBlank(inFileName)) {
122                inFileName = getFileName(pdfBlob) + "-" + inStartPage + "-" + inEndPage + ".pdf";
123            }
124            result.setFilename(inFileName);
125            extracted.close();
126        } catch (IOException | COSVisitorException e) {
127            throw new NuxeoException("Failed to extract the pages", e);
128        }
129        return result;
130    }
131
132    public BlobList getPagesAsImages(String inFileName) throws NuxeoException {
133        ImageIO.scanForPlugins();
134        BlobList results = new BlobList();
135        String resultFileName;
136        // Use file name parameter if passed, otherwise use original file name.
137        if (StringUtils.isBlank(inFileName)) {
138            inFileName = getFileName(pdfBlob) + ".pdf";
139        }
140        try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) {
141            // Get all PDF pages.
142            List pages = pdfDoc.getDocumentCatalog().getAllPages();
143            // Convert each page to PNG.
144            for (Object pageObject : pages) {
145                PDPage page = (PDPage) pageObject;
146                resultFileName = inFileName + "-" + (pages.indexOf(page) + 1);
147                BufferedImage bim = page.convertToImage(BufferedImage.TYPE_INT_RGB, 300);
148                File resultFile = Framework.createTempFile(resultFileName, ".png");
149                FileOutputStream resultFileStream = new FileOutputStream(resultFile);
150                ImageIOUtil.writeImage(bim, "png", resultFileStream, 300);
151                // Convert each PNG to Nuxeo Blob.
152                FileBlob result = new FileBlob(resultFile);
153                result.setFilename(resultFileName + ".png");
154                result.setMimeType("picture/png");
155                // Add to BlobList.
156                results.add(result);
157                Framework.trackFile(resultFile, result);
158            }
159            pdfDoc.close();
160        } catch (IOException e) {
161            throw new NuxeoException("Failed to extract the pages", e);
162        }
163        return results;
164    }
165
166    public void setPassword(String password) {
167        this.password = password;
168    }
169
170}