001/*
002 * (C) Copyright 2016-2018 Nuxeo (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.awt.geom.Rectangle2D;
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.List;
026
027import org.apache.commons.lang3.StringUtils;
028import org.apache.pdfbox.pdmodel.PDDocument;
029import org.apache.pdfbox.pdmodel.PDPage;
030import org.apache.pdfbox.pdmodel.common.PDRectangle;
031import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
032import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
033import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
034import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
035import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
036import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
037import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
038import org.apache.pdfbox.text.PDFTextStripperByArea;
039import org.nuxeo.ecm.core.api.Blob;
040import org.nuxeo.ecm.core.api.NuxeoException;
041
042/**
043 * Extract links as list of {@link LinkInfo} from a PDF.
044 * <p>
045 * In this first version, extracts only the links of type PDActionRemoteGoTo and PDActionLaunch (typically, when a PDF
046 * has a <i>relative</i> link to an external PDF).
047 * <p>
048 * If the PDF is encrypted, a call to <code>setPassword</code> must be done before any attempt to get the links.
049 * <p>
050 * <b>IMPORTANT</b>
051 * <p>
052 * Because we can parse the documents several times to get different links, we don't close it after every call
053 * (optimization), it is the caller responsibility to explicitly close it to avoid leaks.
054 *
055 * @since 8.10
056 */
057public class PDFLinks {
058
059    private Blob pdfBlob;
060
061    private PDDocument pdfDoc;
062
063    private String password;
064
065    private List<LinkInfo> remoteGoToLinks;
066
067    private List<LinkInfo> launchLinks;
068
069    private List<LinkInfo> uriLinks;
070
071    private PDFTextStripperByArea stripper;
072
073    public PDFLinks(Blob inBlob) {
074        pdfBlob = inBlob;
075    }
076
077    /**
078     * To avoid opening/parsing several times the same document, we don't close it after a get...Link() call. It is
079     * important that the caller explcitly closes it.
080     */
081    public void close() {
082        PDFUtils.closeSilently(pdfDoc);
083        pdfDoc = null;
084        pdfBlob = null;
085        password = null;
086        remoteGoToLinks = null;
087        launchLinks = null;
088        stripper = null;
089    }
090
091    /**
092     * Here, we not only open and load the PDF, we also prepare regions to get the text behind the annotation
093     * rectangles.
094     */
095    private void loadAndPreflightPdf() throws NuxeoException {
096        if (pdfDoc != null) {
097            return;
098        }
099        pdfDoc = PDFUtils.load(pdfBlob, password);
100        try {
101            stripper = new PDFTextStripperByArea();
102            for (PDPage page : pdfDoc.getDocumentCatalog().getPages()) {
103                List<?> pageAnnotations = page.getAnnotations();
104                for (Object annotationObject : pageAnnotations) {
105                    PDAnnotation annot = (PDAnnotation) annotationObject;
106                    if (!(annot instanceof PDAnnotationLink)) {
107                        continue;
108                    }
109                    PDAnnotationLink link = (PDAnnotationLink) annot;
110                    PDRectangle rect = link.getRectangle();
111                    // need to reposition link rectangle to match text space
112                    float x = rect.getLowerLeftX(), y = rect.getUpperRightY();
113                    float width = rect.getWidth(), height = rect.getHeight();
114                    int rotation = page.getRotation();
115                    if (rotation == 0) {
116                        PDRectangle pageSize = page.getMediaBox();
117                        y = pageSize.getHeight() - y;
118                    }
119                    Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
120                    stripper.addRegion(String.valueOf(pageAnnotations.indexOf(annot)), awtRect);
121                }
122            }
123        } catch (IOException e) {
124            throw new NuxeoException("Cannot preflight and prepare regions", e);
125        }
126    }
127
128    /**
129     * Return all links of type "GoToR" ({@link PDActionRemoteGoTo#SUB_TYPE}).
130     */
131    public List<LinkInfo> getRemoteGoToLinks() throws IOException {
132        if (remoteGoToLinks == null) {
133            loadAndPreflightPdf();
134            remoteGoToLinks = parseForLinks(PDActionRemoteGoTo.SUB_TYPE);
135        }
136        return remoteGoToLinks;
137    }
138
139    /**
140     * Return all links of type "Launch" ({@link PDActionLaunch#SUB_TYPE}).
141     */
142    public List<LinkInfo> getLaunchLinks() throws IOException {
143        if (launchLinks == null) {
144            loadAndPreflightPdf();
145            launchLinks = parseForLinks(PDActionLaunch.SUB_TYPE);
146        }
147        return launchLinks;
148    }
149
150    /**
151     * Return all links of type "URI" ({@link PDActionURI#SUB_TYPE}).
152     */
153    public List<LinkInfo> getURILinks() throws IOException {
154        if (uriLinks == null) {
155            loadAndPreflightPdf();
156            uriLinks = parseForLinks(PDActionURI.SUB_TYPE);
157        }
158        return uriLinks;
159    }
160
161    private List<LinkInfo> parseForLinks(String inSubType) throws IOException {
162        PDActionRemoteGoTo goTo;
163        PDActionLaunch launch;
164        PDActionURI uri;
165        PDFileSpecification fspec;
166        List<LinkInfo> li = new ArrayList<>();
167        int pageno = 0;
168        for (PDPage page : pdfDoc.getDocumentCatalog().getPages()) {
169            pageno++;
170            stripper.extractRegions(page);
171            List<PDAnnotation> annotations = page.getAnnotations();
172            for (PDAnnotation annot : annotations) {
173                if (!(annot instanceof PDAnnotationLink)) {
174                    continue;
175                }
176                PDAnnotationLink link = (PDAnnotationLink) annot;
177                PDAction action = link.getAction();
178                if (!action.getSubType().equals(inSubType)) {
179                    continue;
180                }
181                String urlText = stripper.getTextForRegion(String.valueOf(annotations.indexOf(annot)));
182                String urlValue = null;
183                switch (inSubType) {
184                case PDActionRemoteGoTo.SUB_TYPE:
185                    goTo = (PDActionRemoteGoTo) action;
186                    fspec = goTo.getFile();
187                    urlValue = fspec.getFile();
188                    break;
189                case PDActionLaunch.SUB_TYPE:
190                    launch = (PDActionLaunch) action;
191                    fspec = launch.getFile();
192                    urlValue = fspec.getFile();
193                    break;
194                case PDActionURI.SUB_TYPE:
195                    uri = (PDActionURI) action;
196                    urlValue = uri.getURI();
197                    break;
198                // others...
199                }
200                if (StringUtils.isNotBlank(urlValue)) {
201                    li.add(new LinkInfo(pageno, inSubType, urlText, urlValue));
202                }
203            }
204        }
205        return li;
206    }
207
208    public void setPassword(String password) {
209        this.password = password;
210    }
211
212}