001/*
002 * (C) Copyright 2016-2018 Nuxeo (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.awt.geom.Rectangle2D;
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.List;
026
027import org.apache.commons.lang3.StringUtils;
028import org.apache.pdfbox.pdmodel.PDDocument;
029import org.apache.pdfbox.pdmodel.PDPage;
030import org.apache.pdfbox.pdmodel.common.PDRectangle;
031import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
032import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
033import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionLaunch;
034import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionRemoteGoTo;
035import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
036import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
037import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
038import org.apache.pdfbox.util.PDFTextStripperByArea;
039import org.nuxeo.ecm.core.api.Blob;
040import org.nuxeo.ecm.core.api.NuxeoException;
041
042/**
043 * Extract links as list of {@link LinkInfo} from a PDF.
044 * <p>
045 * In this first version, extracts only the links of type PDActionRemoteGoTo and PDActionLaunch (typically, when a PDF
046 * has a <i>relative</i> link to an external PDF).
047 * <p>
048 * If the PDF is encrypted, a call to <code>setPassword</code> must be done before any attempt to get the links.
049 * <p>
050 * <b>IMPORTANT</b>
051 * <p>
052 * Because we can parse the documents several times to get different links, we don't close it after every call
053 * (optimization), it is the caller responsibility to explicitly close it to avoid leaks.
054 *
055 * @since 8.10
056 */
057public class PDFLinks {
058
059    private Blob pdfBlob;
060
061    private PDDocument pdfDoc;
062
063    private String password;
064
065    private List<LinkInfo> remoteGoToLinks;
066
067    private List<LinkInfo> launchLinks;
068
069    private List<LinkInfo> uriLinks;
070
071    private PDFTextStripperByArea stripper;
072
073    public PDFLinks(Blob inBlob) {
074        pdfBlob = inBlob;
075    }
076
077    /**
078     * To avoid opening/parsing several times the same document, we don't close it after a get...Link() call. It is
079     * important that the caller explcitly closes it.
080     */
081    public void close() {
082        PDFUtils.closeSilently(pdfDoc);
083        pdfDoc = null;
084        pdfBlob = null;
085        password = null;
086        remoteGoToLinks = null;
087        launchLinks = null;
088        stripper = null;
089    }
090
091    /**
092     * Here, we not only open and load the PDF, we also prepare regions to get the text behind the annotation
093     * rectangles.
094     */
095    private void loadAndPreflightPdf() throws NuxeoException {
096        if (pdfDoc != null) {
097            return;
098        }
099        pdfDoc = PDFUtils.load(pdfBlob, password);
100        try {
101            stripper = new PDFTextStripperByArea();
102            for (Object pageObject : pdfDoc.getDocumentCatalog().getAllPages()) {
103                PDPage page = (PDPage) pageObject;
104                List pageAnnotations = page.getAnnotations();
105                for (Object annotationObject : pageAnnotations) {
106                    PDAnnotation annot = (PDAnnotation) annotationObject;
107                    if (!(annot instanceof PDAnnotationLink)) {
108                        continue;
109                    }
110                    PDAnnotationLink link = (PDAnnotationLink) annot;
111                    PDRectangle rect = link.getRectangle();
112                    // need to reposition link rectangle to match text space
113                    float x = rect.getLowerLeftX(), y = rect.getUpperRightY();
114                    float width = rect.getWidth(), height = rect.getHeight();
115                    int rotation = page.findRotation();
116                    if (rotation == 0) {
117                        PDRectangle pageSize = page.findMediaBox();
118                        y = pageSize.getHeight() - y;
119                    }
120                    Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
121                    stripper.addRegion(String.valueOf(pageAnnotations.indexOf(annot)), awtRect);
122                }
123            }
124        } catch (IOException e) {
125            throw new NuxeoException("Cannot preflight and prepare regions", e);
126        }
127    }
128
129    /**
130     * Return all links of type "GoToR" ({@link PDActionRemoteGoTo#SUB_TYPE}).
131     */
132    public List<LinkInfo> getRemoteGoToLinks() throws IOException {
133        if (remoteGoToLinks == null) {
134            loadAndPreflightPdf();
135            remoteGoToLinks = parseForLinks(PDActionRemoteGoTo.SUB_TYPE);
136        }
137        return remoteGoToLinks;
138    }
139
140    /**
141     * Return all links of type "Launch" ({@link PDActionLaunch#SUB_TYPE}).
142     */
143    public List<LinkInfo> getLaunchLinks() throws IOException {
144        if (launchLinks == null) {
145            loadAndPreflightPdf();
146            launchLinks = parseForLinks(PDActionLaunch.SUB_TYPE);
147        }
148        return launchLinks;
149    }
150
151    /**
152     * Return all links of type "URI" ({@link PDActionURI#SUB_TYPE}).
153     */
154    public List<LinkInfo> getURILinks() throws IOException {
155        if (uriLinks == null) {
156            loadAndPreflightPdf();
157            uriLinks = parseForLinks(PDActionURI.SUB_TYPE);
158        }
159        return uriLinks;
160    }
161
162    private List<LinkInfo> parseForLinks(String inSubType) throws IOException {
163        PDActionRemoteGoTo goTo;
164        PDActionLaunch launch;
165        PDActionURI uri;
166        PDFileSpecification fspec;
167        List<LinkInfo> li = new ArrayList<>();
168        List allPages = pdfDoc.getDocumentCatalog().getAllPages();
169        for (Object pageObject : allPages) {
170            PDPage page = (PDPage) pageObject;
171            stripper.extractRegions(page);
172            List<PDAnnotation> annotations = page.getAnnotations();
173            for (PDAnnotation annot : annotations) {
174                if (!(annot instanceof PDAnnotationLink)) {
175                    continue;
176                }
177                PDAnnotationLink link = (PDAnnotationLink) annot;
178                PDAction action = link.getAction();
179                if (!action.getSubType().equals(inSubType)) {
180                    continue;
181                }
182                String urlText = stripper.getTextForRegion(String.valueOf(annotations.indexOf(annot)));
183                String urlValue = null;
184                switch (inSubType) {
185                case PDActionRemoteGoTo.SUB_TYPE:
186                    goTo = (PDActionRemoteGoTo) action;
187                    fspec = goTo.getFile();
188                    urlValue = fspec.getFile();
189                    break;
190                case PDActionLaunch.SUB_TYPE:
191                    launch = (PDActionLaunch) action;
192                    fspec = launch.getFile();
193                    urlValue = fspec.getFile();
194                    break;
195                case PDActionURI.SUB_TYPE:
196                    uri = (PDActionURI) action;
197                    urlValue = uri.getURI();
198                    break;
199                // others...
200                }
201                if (StringUtils.isNotBlank(urlValue)) {
202                    li.add(new LinkInfo(allPages.indexOf(page) + 1, inSubType, urlText, urlValue));
203                }
204            }
205        }
206        return li;
207    }
208
209    public void setPassword(String password) {
210        this.password = password;
211    }
212
213}