001/* 002 * (C) Copyright 2016-2018 Nuxeo (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Thibaud Arguillere 018 * Miguel Nixo 019 */ 020package org.nuxeo.ecm.platform.pdf; 021 022import java.awt.geom.Rectangle2D; 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.List; 026 027import org.apache.commons.lang3.StringUtils; 028import org.apache.pdfbox.pdmodel.PDDocument; 029import org.apache.pdfbox.pdmodel.PDPage; 030import org.apache.pdfbox.pdmodel.common.PDRectangle; 031import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification; 032import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction; 033import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionLaunch; 034import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionRemoteGoTo; 035import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI; 036import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; 037import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; 038import org.apache.pdfbox.util.PDFTextStripperByArea; 039import org.nuxeo.ecm.core.api.Blob; 040import org.nuxeo.ecm.core.api.NuxeoException; 041 042/** 043 * Extract links as list of {@link LinkInfo} from a PDF. 044 * <p> 045 * In this first version, extracts only the links of type PDActionRemoteGoTo and PDActionLaunch (typically, when a PDF 046 * has a <i>relative</i> link to an external PDF). 047 * <p> 048 * If the PDF is encrypted, a call to <code>setPassword</code> must be done before any attempt to get the links. 049 * <p> 050 * <b>IMPORTANT</b> 051 * <p> 052 * Because we can parse the documents several times to get different links, we don't close it after every call 053 * (optimization), it is the caller responsibility to explicitly close it to avoid leaks. 054 * 055 * @since 8.10 056 */ 057public class PDFLinks { 058 059 private Blob pdfBlob; 060 061 private PDDocument pdfDoc; 062 063 private String password; 064 065 private List<LinkInfo> remoteGoToLinks; 066 067 private List<LinkInfo> launchLinks; 068 069 private List<LinkInfo> uriLinks; 070 071 private PDFTextStripperByArea stripper; 072 073 public PDFLinks(Blob inBlob) { 074 pdfBlob = inBlob; 075 } 076 077 /** 078 * To avoid opening/parsing several times the same document, we don't close it after a get...Link() call. It is 079 * important that the caller explcitly closes it. 080 */ 081 public void close() { 082 PDFUtils.closeSilently(pdfDoc); 083 pdfDoc = null; 084 pdfBlob = null; 085 password = null; 086 remoteGoToLinks = null; 087 launchLinks = null; 088 stripper = null; 089 } 090 091 /** 092 * Here, we not only open and load the PDF, we also prepare regions to get the text behind the annotation 093 * rectangles. 094 */ 095 private void loadAndPreflightPdf() throws NuxeoException { 096 if (pdfDoc != null) { 097 return; 098 } 099 pdfDoc = PDFUtils.load(pdfBlob, password); 100 try { 101 stripper = new PDFTextStripperByArea(); 102 for (Object pageObject : pdfDoc.getDocumentCatalog().getAllPages()) { 103 PDPage page = (PDPage) pageObject; 104 List pageAnnotations = page.getAnnotations(); 105 for (Object annotationObject : pageAnnotations) { 106 PDAnnotation annot = (PDAnnotation) annotationObject; 107 if (!(annot instanceof PDAnnotationLink)) { 108 continue; 109 } 110 PDAnnotationLink link = (PDAnnotationLink) annot; 111 PDRectangle rect = link.getRectangle(); 112 // need to reposition link rectangle to match text space 113 float x = rect.getLowerLeftX(), y = rect.getUpperRightY(); 114 float width = rect.getWidth(), height = rect.getHeight(); 115 int rotation = page.findRotation(); 116 if (rotation == 0) { 117 PDRectangle pageSize = page.findMediaBox(); 118 y = pageSize.getHeight() - y; 119 } 120 Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); 121 stripper.addRegion(String.valueOf(pageAnnotations.indexOf(annot)), awtRect); 122 } 123 } 124 } catch (IOException e) { 125 throw new NuxeoException("Cannot preflight and prepare regions", e); 126 } 127 } 128 129 /** 130 * Return all links of type "GoToR" ({@link PDActionRemoteGoTo#SUB_TYPE}). 131 */ 132 public List<LinkInfo> getRemoteGoToLinks() throws IOException { 133 if (remoteGoToLinks == null) { 134 loadAndPreflightPdf(); 135 remoteGoToLinks = parseForLinks(PDActionRemoteGoTo.SUB_TYPE); 136 } 137 return remoteGoToLinks; 138 } 139 140 /** 141 * Return all links of type "Launch" ({@link PDActionLaunch#SUB_TYPE}). 142 */ 143 public List<LinkInfo> getLaunchLinks() throws IOException { 144 if (launchLinks == null) { 145 loadAndPreflightPdf(); 146 launchLinks = parseForLinks(PDActionLaunch.SUB_TYPE); 147 } 148 return launchLinks; 149 } 150 151 /** 152 * Return all links of type "URI" ({@link PDActionURI#SUB_TYPE}). 153 */ 154 public List<LinkInfo> getURILinks() throws IOException { 155 if (uriLinks == null) { 156 loadAndPreflightPdf(); 157 uriLinks = parseForLinks(PDActionURI.SUB_TYPE); 158 } 159 return uriLinks; 160 } 161 162 private List<LinkInfo> parseForLinks(String inSubType) throws IOException { 163 PDActionRemoteGoTo goTo; 164 PDActionLaunch launch; 165 PDActionURI uri; 166 PDFileSpecification fspec; 167 List<LinkInfo> li = new ArrayList<>(); 168 List allPages = pdfDoc.getDocumentCatalog().getAllPages(); 169 for (Object pageObject : allPages) { 170 PDPage page = (PDPage) pageObject; 171 stripper.extractRegions(page); 172 List<PDAnnotation> annotations = page.getAnnotations(); 173 for (PDAnnotation annot : annotations) { 174 if (!(annot instanceof PDAnnotationLink)) { 175 continue; 176 } 177 PDAnnotationLink link = (PDAnnotationLink) annot; 178 PDAction action = link.getAction(); 179 if (!action.getSubType().equals(inSubType)) { 180 continue; 181 } 182 String urlText = stripper.getTextForRegion(String.valueOf(annotations.indexOf(annot))); 183 String urlValue = null; 184 switch (inSubType) { 185 case PDActionRemoteGoTo.SUB_TYPE: 186 goTo = (PDActionRemoteGoTo) action; 187 fspec = goTo.getFile(); 188 urlValue = fspec.getFile(); 189 break; 190 case PDActionLaunch.SUB_TYPE: 191 launch = (PDActionLaunch) action; 192 fspec = launch.getFile(); 193 urlValue = fspec.getFile(); 194 break; 195 case PDActionURI.SUB_TYPE: 196 uri = (PDActionURI) action; 197 urlValue = uri.getURI(); 198 break; 199 // others... 200 } 201 if (StringUtils.isNotBlank(urlValue)) { 202 li.add(new LinkInfo(allPages.indexOf(page) + 1, inSubType, urlText, urlValue)); 203 } 204 } 205 } 206 return li; 207 } 208 209 public void setPassword(String password) { 210 this.password = password; 211 } 212 213}