001/* 002 * (C) Copyright 2016-2018 Nuxeo (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Thibaud Arguillere 018 * Miguel Nixo 019 */ 020package org.nuxeo.ecm.platform.pdf; 021 022import java.awt.geom.Rectangle2D; 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.List; 026 027import org.apache.commons.lang3.StringUtils; 028import org.apache.pdfbox.pdmodel.PDDocument; 029import org.apache.pdfbox.pdmodel.PDPage; 030import org.apache.pdfbox.pdmodel.common.PDRectangle; 031import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification; 032import org.apache.pdfbox.pdmodel.interactive.action.PDAction; 033import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch; 034import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo; 035import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; 036import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; 037import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; 038import org.apache.pdfbox.text.PDFTextStripperByArea; 039import org.nuxeo.ecm.core.api.Blob; 040import org.nuxeo.ecm.core.api.NuxeoException; 041 042/** 043 * Extract links as list of {@link LinkInfo} from a PDF. 044 * <p> 045 * In this first version, extracts only the links of type PDActionRemoteGoTo and PDActionLaunch (typically, when a PDF 046 * has a <i>relative</i> link to an external PDF). 047 * <p> 048 * If the PDF is encrypted, a call to <code>setPassword</code> must be done before any attempt to get the links. 049 * <p> 050 * <b>IMPORTANT</b> 051 * <p> 052 * Because we can parse the documents several times to get different links, we don't close it after every call 053 * (optimization), it is the caller responsibility to explicitly close it to avoid leaks. 054 * 055 * @since 8.10 056 */ 057public class PDFLinks { 058 059 private Blob pdfBlob; 060 061 private PDDocument pdfDoc; 062 063 private String password; 064 065 private List<LinkInfo> remoteGoToLinks; 066 067 private List<LinkInfo> launchLinks; 068 069 private List<LinkInfo> uriLinks; 070 071 private PDFTextStripperByArea stripper; 072 073 public PDFLinks(Blob inBlob) { 074 pdfBlob = inBlob; 075 } 076 077 /** 078 * To avoid opening/parsing several times the same document, we don't close it after a get...Link() call. It is 079 * important that the caller explcitly closes it. 080 */ 081 public void close() { 082 PDFUtils.closeSilently(pdfDoc); 083 pdfDoc = null; 084 pdfBlob = null; 085 password = null; 086 remoteGoToLinks = null; 087 launchLinks = null; 088 stripper = null; 089 } 090 091 /** 092 * Here, we not only open and load the PDF, we also prepare regions to get the text behind the annotation 093 * rectangles. 094 */ 095 private void loadAndPreflightPdf() throws NuxeoException { 096 if (pdfDoc != null) { 097 return; 098 } 099 pdfDoc = PDFUtils.load(pdfBlob, password); 100 try { 101 stripper = new PDFTextStripperByArea(); 102 for (PDPage page : pdfDoc.getDocumentCatalog().getPages()) { 103 List<?> pageAnnotations = page.getAnnotations(); 104 for (Object annotationObject : pageAnnotations) { 105 PDAnnotation annot = (PDAnnotation) annotationObject; 106 if (!(annot instanceof PDAnnotationLink)) { 107 continue; 108 } 109 PDAnnotationLink link = (PDAnnotationLink) annot; 110 PDRectangle rect = link.getRectangle(); 111 // need to reposition link rectangle to match text space 112 float x = rect.getLowerLeftX(), y = rect.getUpperRightY(); 113 float width = rect.getWidth(), height = rect.getHeight(); 114 int rotation = page.getRotation(); 115 if (rotation == 0) { 116 PDRectangle pageSize = page.getMediaBox(); 117 y = pageSize.getHeight() - y; 118 } 119 Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); 120 stripper.addRegion(String.valueOf(pageAnnotations.indexOf(annot)), awtRect); 121 } 122 } 123 } catch (IOException e) { 124 throw new NuxeoException("Cannot preflight and prepare regions", e); 125 } 126 } 127 128 /** 129 * Return all links of type "GoToR" ({@link PDActionRemoteGoTo#SUB_TYPE}). 130 */ 131 public List<LinkInfo> getRemoteGoToLinks() throws IOException { 132 if (remoteGoToLinks == null) { 133 loadAndPreflightPdf(); 134 remoteGoToLinks = parseForLinks(PDActionRemoteGoTo.SUB_TYPE); 135 } 136 return remoteGoToLinks; 137 } 138 139 /** 140 * Return all links of type "Launch" ({@link PDActionLaunch#SUB_TYPE}). 141 */ 142 public List<LinkInfo> getLaunchLinks() throws IOException { 143 if (launchLinks == null) { 144 loadAndPreflightPdf(); 145 launchLinks = parseForLinks(PDActionLaunch.SUB_TYPE); 146 } 147 return launchLinks; 148 } 149 150 /** 151 * Return all links of type "URI" ({@link PDActionURI#SUB_TYPE}). 152 */ 153 public List<LinkInfo> getURILinks() throws IOException { 154 if (uriLinks == null) { 155 loadAndPreflightPdf(); 156 uriLinks = parseForLinks(PDActionURI.SUB_TYPE); 157 } 158 return uriLinks; 159 } 160 161 private List<LinkInfo> parseForLinks(String inSubType) throws IOException { 162 PDActionRemoteGoTo goTo; 163 PDActionLaunch launch; 164 PDActionURI uri; 165 PDFileSpecification fspec; 166 List<LinkInfo> li = new ArrayList<>(); 167 int pageno = 0; 168 for (PDPage page : pdfDoc.getDocumentCatalog().getPages()) { 169 pageno++; 170 stripper.extractRegions(page); 171 List<PDAnnotation> annotations = page.getAnnotations(); 172 for (PDAnnotation annot : annotations) { 173 if (!(annot instanceof PDAnnotationLink)) { 174 continue; 175 } 176 PDAnnotationLink link = (PDAnnotationLink) annot; 177 PDAction action = link.getAction(); 178 if (!action.getSubType().equals(inSubType)) { 179 continue; 180 } 181 String urlText = stripper.getTextForRegion(String.valueOf(annotations.indexOf(annot))); 182 String urlValue = null; 183 switch (inSubType) { 184 case PDActionRemoteGoTo.SUB_TYPE: 185 goTo = (PDActionRemoteGoTo) action; 186 fspec = goTo.getFile(); 187 urlValue = fspec.getFile(); 188 break; 189 case PDActionLaunch.SUB_TYPE: 190 launch = (PDActionLaunch) action; 191 fspec = launch.getFile(); 192 urlValue = fspec.getFile(); 193 break; 194 case PDActionURI.SUB_TYPE: 195 uri = (PDActionURI) action; 196 urlValue = uri.getURI(); 197 break; 198 // others... 199 } 200 if (StringUtils.isNotBlank(urlValue)) { 201 li.add(new LinkInfo(pageno, inSubType, urlText, urlValue)); 202 } 203 } 204 } 205 return li; 206 } 207 208 public void setPassword(String password) { 209 this.password = password; 210 } 211 212}