001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.awt.geom.Rectangle2D;
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.List;
026import org.apache.commons.lang.StringUtils;
027import org.apache.pdfbox.pdmodel.PDDocument;
028import org.apache.pdfbox.pdmodel.PDPage;
029import org.apache.pdfbox.pdmodel.common.PDRectangle;
030import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
031import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
032import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionLaunch;
033import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionRemoteGoTo;
034import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
035import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
036import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
037import org.apache.pdfbox.util.PDFTextStripperByArea;
038import org.nuxeo.ecm.core.api.Blob;
039import org.nuxeo.ecm.core.api.NuxeoException;
040
041/**
042 * Extract links as list of {@link LinkInfo} from a PDF.
043 * <p>
044 * In this first version, extracts only the links of type PDActionRemoteGoTo and PDActionLaunch (typically, when a PDF
045 * has a <i>relative</i> link to an external PDF).
046 * <p>
047 * If the PDF is encrypted, a call to <code>setPassword</code> must be done before any attempt to get the links.
048 * <p>
049 * <b>IMPORTANT</b>
050 * <p>
051 * Because we can parse the documents several times to get different links, we don't close it after every call
052 * (optimization), it is the caller responsibility to explicitly close it to avoid leaks.
053 *
054 * @since 8.10
055 */
056public class PDFLinks {
057
058    private Blob pdfBlob;
059
060    private PDDocument pdfDoc;
061
062    private String password;
063
064    private List<LinkInfo> remoteGoToLinks;
065
066    private List<LinkInfo> launchLinks;
067
068    private List<LinkInfo> uriLinks;
069
070    private PDFTextStripperByArea stripper;
071
072    public PDFLinks(Blob inBlob) {
073        pdfBlob = inBlob;
074    }
075
076    /**
077     * To avoid opening/parsing several times the same document, we don't close it after a get...Link() call. It is
078     * important that the caller explcitly closes it.
079     */
080    public void close() {
081        PDFUtils.closeSilently(pdfDoc);
082        pdfDoc = null;
083        pdfBlob = null;
084        password = null;
085        remoteGoToLinks = null;
086        launchLinks = null;
087        stripper = null;
088    }
089
090    /**
091     * Here, we not only open and load the PDF, we also prepare regions to get the text behind the annotation
092     * rectangles.
093     */
094    private void loadAndPreflightPdf() throws NuxeoException {
095        if (pdfDoc != null) {
096            return;
097        }
098        pdfDoc = PDFUtils.load(pdfBlob, password);
099        try {
100            stripper = new PDFTextStripperByArea();
101            for (Object pageObject : pdfDoc.getDocumentCatalog().getAllPages()) {
102                PDPage page = (PDPage) pageObject;
103                List pageAnnotations = page.getAnnotations();
104                for (Object annotationObject : pageAnnotations) {
105                    PDAnnotation annot = (PDAnnotation) annotationObject;
106                    if (!(annot instanceof  PDAnnotationLink)) {
107                        continue;
108                    }
109                    PDAnnotationLink link = (PDAnnotationLink) annot;
110                    PDRectangle rect = link.getRectangle();
111                    // need to reposition link rectangle to match text space
112                    float x = rect.getLowerLeftX(), y = rect.getUpperRightY();
113                    float width = rect.getWidth(), height = rect.getHeight();
114                    int rotation = page.findRotation();
115                    if (rotation == 0) {
116                        PDRectangle pageSize = page.findMediaBox();
117                        y = pageSize.getHeight() - y;
118                    }
119                    Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
120                    stripper.addRegion(String.valueOf(pageAnnotations.indexOf(annot)), awtRect);
121                }
122            }
123        } catch (IOException e) {
124            throw new NuxeoException("Cannot preflight and prepare regions", e);
125        }
126    }
127
128    /**
129     * Return all links of type "GoToR" ({@link PDActionRemoteGoTo#SUB_TYPE}).
130     *
131     * @throws IOException
132     */
133    public List<LinkInfo> getRemoteGoToLinks() throws IOException {
134        if (remoteGoToLinks == null) {
135            loadAndPreflightPdf();
136            remoteGoToLinks = parseForLinks(PDActionRemoteGoTo.SUB_TYPE);
137        }
138        return remoteGoToLinks;
139    }
140
141    /**
142     * Return all links of type "Launch" ({@link PDActionLaunch#SUB_TYPE}).
143     *
144     * @throws IOException
145     */
146    public List<LinkInfo> getLaunchLinks() throws IOException {
147        if (launchLinks == null) {
148            loadAndPreflightPdf();
149            launchLinks = parseForLinks(PDActionLaunch.SUB_TYPE);
150        }
151        return launchLinks;
152    }
153
154    /**
155     * Return all links of type "URI" ({@link PDActionURI#SUB_TYPE}).
156     *
157     * @throws IOException
158     */
159    public List<LinkInfo> getURILinks() throws IOException {
160        if (uriLinks == null) {
161            loadAndPreflightPdf();
162            uriLinks = parseForLinks(PDActionURI.SUB_TYPE);
163        }
164        return uriLinks;
165    }
166
167    private List<LinkInfo> parseForLinks(String inSubType) throws IOException {
168        PDActionRemoteGoTo goTo;
169        PDActionLaunch launch;
170        PDActionURI uri;
171        PDFileSpecification fspec;
172        List<LinkInfo> li = new ArrayList<>();
173        List allPages = pdfDoc.getDocumentCatalog().getAllPages();
174        for (Object pageObject : allPages) {
175            PDPage page = (PDPage) pageObject;
176            stripper.extractRegions(page);
177            List<PDAnnotation> annotations = page.getAnnotations();
178            for (PDAnnotation annot : annotations) {
179                if (!(annot instanceof  PDAnnotationLink)) {
180                    continue;
181                }
182                PDAnnotationLink link = (PDAnnotationLink) annot;
183                PDAction action = link.getAction();
184                if (!action.getSubType().equals(inSubType)) {
185                    continue;
186                }
187                String urlText = stripper.getTextForRegion(String.valueOf(annotations.indexOf(annot)));
188                String urlValue = null;
189                switch (inSubType) {
190                case PDActionRemoteGoTo.SUB_TYPE:
191                    goTo = (PDActionRemoteGoTo) action;
192                    fspec = goTo.getFile();
193                    urlValue = fspec.getFile();
194                    break;
195                case PDActionLaunch.SUB_TYPE:
196                    launch = (PDActionLaunch) action;
197                    fspec = launch.getFile();
198                    urlValue = fspec.getFile();
199                    break;
200                case PDActionURI.SUB_TYPE:
201                    uri = (PDActionURI) action;
202                    urlValue = uri.getURI();
203                    break;
204                // others...
205                }
206                if (StringUtils.isNotBlank(urlValue)) {
207                    li.add(new LinkInfo(allPages.indexOf(page) + 1, inSubType, urlText, urlValue));
208                }
209            }
210        }
211        return li;
212    }
213
214    public void setPassword(String password) {
215        this.password = password;
216    }
217
218}