/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Thibaud Arguillere
 *     Miguel Nixo
 */
package org.nuxeo.ecm.platform.pdf;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionLaunch;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionRemoteGoTo;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.util.PDFTextStripperByArea;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.NuxeoException;

/**
 * Extract links as list of {@link LinkInfo} from a PDF.
 * <p>
 * Extracts the links of type PDActionRemoteGoTo and PDActionLaunch (typically, when a PDF has a <i>relative</i> link
 * to an external PDF) as well as the links of type PDActionURI.
 * <p>
 * If the PDF is encrypted, a call to <code>setPassword</code> must be done before any attempt to get the links.
 * <p>
 * <b>IMPORTANT</b>
 * <p>
 * Because we can parse the documents several times to get different links, we don't close it after every call
 * (optimization), it is the caller responsibility to explicitly close it to avoid leaks. The class implements
 * {@link AutoCloseable} so it can be used in a try-with-resources statement.
 *
 * @since 8.10
 */
public class PDFLinks implements AutoCloseable {

    private Blob pdfBlob;

    private PDDocument pdfDoc;

    private String password;

    private List<LinkInfo> remoteGoToLinks;

    private List<LinkInfo> launchLinks;

    private List<LinkInfo> uriLinks;

    private PDFTextStripperByArea stripper;

    public PDFLinks(Blob inBlob) {
        pdfBlob = inBlob;
    }

    /**
     * To avoid opening/parsing several times the same document, we don't close it after a get...Link() call. It is
     * important that the caller explicitly closes it.
     * <p>
     * Closes the document and drops every cached value (blob, password, stripper and the three link lists).
     */
    @Override
    public void close() {
        PDFUtils.closeSilently(pdfDoc);
        pdfDoc = null;
        pdfBlob = null;
        password = null;
        remoteGoToLinks = null;
        launchLinks = null;
        // Must be reset like the two other caches, otherwise stale URI links survive a close().
        uriLinks = null;
        stripper = null;
    }

    /**
     * Builds the name of the text region registered for a link annotation.
     * <p>
     * The page index is part of the name so that annotations having the same position in the annotation list of
     * different pages do not share (and overwrite) the same region.
     *
     * @param pageIndex zero-based index of the page in the document
     * @param annotationIndex zero-based index of the annotation in the page's annotation list
     * @return the region name
     */
    private static String regionName(int pageIndex, int annotationIndex) {
        return pageIndex + "_" + annotationIndex;
    }

    /**
     * Here, we not only open and load the PDF, we also prepare regions to get the text behind the annotation
     * rectangles.
     * <p>
     * Does nothing if the document is already loaded. The document and the stripper are stored in the instance only
     * once every region has been registered; if the preparation fails, the document that was just opened is closed
     * so that a later call starts again from a clean state.
     *
     * @throws NuxeoException if the regions cannot be prepared
     */
    private void loadAndPreflightPdf() throws NuxeoException {
        if (pdfDoc != null) {
            return;
        }
        PDDocument loadedDoc = PDFUtils.load(pdfBlob, password);
        PDFTextStripperByArea areaStripper;
        try {
            areaStripper = new PDFTextStripperByArea();
            List<?> allPages = loadedDoc.getDocumentCatalog().getAllPages();
            for (int pageIndex = 0; pageIndex < allPages.size(); pageIndex++) {
                PDPage page = (PDPage) allPages.get(pageIndex);
                List<PDAnnotation> pageAnnotations = page.getAnnotations();
                for (int annotIndex = 0; annotIndex < pageAnnotations.size(); annotIndex++) {
                    PDAnnotation annot = pageAnnotations.get(annotIndex);
                    if (!(annot instanceof PDAnnotationLink)) {
                        continue;
                    }
                    PDAnnotationLink link = (PDAnnotationLink) annot;
                    PDRectangle rect = link.getRectangle();
                    // need to reposition link rectangle to match text space
                    float x = rect.getLowerLeftX();
                    float y = rect.getUpperRightY();
                    float width = rect.getWidth();
                    float height = rect.getHeight();
                    int rotation = page.findRotation();
                    if (rotation == 0) {
                        PDRectangle pageSize = page.findMediaBox();
                        y = pageSize.getHeight() - y;
                    }
                    Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
                    areaStripper.addRegion(regionName(pageIndex, annotIndex), awtRect);
                }
            }
        } catch (IOException e) {
            // Do not keep a half-prepared document around (and do not leak it).
            PDFUtils.closeSilently(loadedDoc);
            throw new NuxeoException("Cannot preflight and prepare regions", e);
        }
        pdfDoc = loadedDoc;
        stripper = areaStripper;
    }

    /**
     * Return all links of type "GoToR" ({@link PDActionRemoteGoTo#SUB_TYPE}).
     * <p>
     * The result is computed on first call and cached until {@link #close()} is called.
     *
     * @throws IOException
     */
    public List<LinkInfo> getRemoteGoToLinks() throws IOException {
        if (remoteGoToLinks == null) {
            loadAndPreflightPdf();
            remoteGoToLinks = parseForLinks(PDActionRemoteGoTo.SUB_TYPE);
        }
        return remoteGoToLinks;
    }

    /**
     * Return all links of type "Launch" ({@link PDActionLaunch#SUB_TYPE}).
     * <p>
     * The result is computed on first call and cached until {@link #close()} is called.
     *
     * @throws IOException
     */
    public List<LinkInfo> getLaunchLinks() throws IOException {
        if (launchLinks == null) {
            loadAndPreflightPdf();
            launchLinks = parseForLinks(PDActionLaunch.SUB_TYPE);
        }
        return launchLinks;
    }

    /**
     * Return all links of type "URI" ({@link PDActionURI#SUB_TYPE}).
     * <p>
     * The result is computed on first call and cached until {@link #close()} is called.
     *
     * @throws IOException
     */
    public List<LinkInfo> getURILinks() throws IOException {
        if (uriLinks == null) {
            loadAndPreflightPdf();
            uriLinks = parseForLinks(PDActionURI.SUB_TYPE);
        }
        return uriLinks;
    }

    /**
     * Walks every page of the loaded document and collects the link annotations whose action has the given sub
     * type.
     * <p>
     * Links without an action, with another action sub type, or whose target is blank are skipped.
     *
     * @param inSubType the action sub type to look for
     * @return the matching links, with a one-based page number, the text found in the annotation rectangle and the
     *         link target; never {@code null}
     * @throws IOException if the text or the link target cannot be read
     */
    private List<LinkInfo> parseForLinks(String inSubType) throws IOException {
        List<LinkInfo> li = new ArrayList<>();
        List<?> allPages = pdfDoc.getDocumentCatalog().getAllPages();
        for (int pageIndex = 0; pageIndex < allPages.size(); pageIndex++) {
            PDPage page = (PDPage) allPages.get(pageIndex);
            stripper.extractRegions(page);
            List<PDAnnotation> annotations = page.getAnnotations();
            for (int annotIndex = 0; annotIndex < annotations.size(); annotIndex++) {
                PDAnnotation annot = annotations.get(annotIndex);
                if (!(annot instanceof PDAnnotationLink)) {
                    continue;
                }
                PDAction action = ((PDAnnotationLink) annot).getAction();
                // A link annotation is not required to carry an action: guard against null.
                if (action == null || !inSubType.equals(action.getSubType())) {
                    continue;
                }
                String urlText = stripper.getTextForRegion(regionName(pageIndex, annotIndex));
                String urlValue = getTarget(action, inSubType);
                if (StringUtils.isNotBlank(urlValue)) {
                    li.add(new LinkInfo(pageIndex + 1, inSubType, urlText, urlValue));
                }
            }
        }
        return li;
    }

    /**
     * Reads the target (file or URI) of an action whose sub type is already known to be {@code inSubType}.
     *
     * @param action the action to read, of sub type {@code inSubType}
     * @param inSubType the action sub type
     * @return the target, or {@code null} if the sub type is not handled or the action has no file specification
     * @throws IOException if the file specification cannot be read
     */
    private static String getTarget(PDAction action, String inSubType) throws IOException {
        PDFileSpecification fspec;
        switch (inSubType) {
        case PDActionRemoteGoTo.SUB_TYPE:
            fspec = ((PDActionRemoteGoTo) action).getFile();
            return fspec == null ? null : fspec.getFile();
        case PDActionLaunch.SUB_TYPE:
            fspec = ((PDActionLaunch) action).getFile();
            return fspec == null ? null : fspec.getFile();
        case PDActionURI.SUB_TYPE:
            return ((PDActionURI) action).getURI();
        default:
            // others...
            return null;
        }
    }

    public void setPassword(String password) {
        this.password = password;
    }

}