/*
 * (C) Copyright 2002-2010 Nuxeo SA (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Julien Anguenot
 *     Florent Guillaume
 */
package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.operator.OperatorProcessor;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;

/**
 * Converter that extracts the text content of a PDF blob using PDFBox and
 * returns it as a {@code text/plain} UTF-8 blob.
 * <p>
 * When the document's current access permission does not allow content
 * extraction, an empty blob is returned instead of failing.
 */
public class PDF2TextConverter implements Converter {

    private static final Log log = LogFactory.getLog(PDF2TextConverter.class);

    /**
     * Text stripper that uses platform-independent separators and that does
     * not abort the whole extraction when a single operator fails: an
     * {@link IOException} raised while processing an operator is logged (once
     * per originating stack frame) and then ignored.
     */
    public static class PatchedPDFTextStripper extends PDFTextStripper {

        /**
         * Top stack frames of the errors already reported, shared by all
         * instances. Always accessed while holding its own monitor. May
         * contain {@code null}, which stands for "error without any stack
         * frame".
         */
        static final Set<StackTraceElement> loggedStacks = new HashSet<>();

        /**
         * Creates a stripper whose line, paragraph and article separators do
         * not depend on the platform.
         *
         * @throws IOException if the parent constructor fails
         */
        public PatchedPDFTextStripper() throws IOException {
            super();
            // platform independent line and paragraph separators
            setLineSeparator("\n");
            setParagraphEnd("\n\n");
            setArticleEnd("\n\n");
        }

        /**
         * Reads, through reflection, the value held by this instance for a
         * field declared on {@link PDFStreamEngine}, bypassing access checks.
         *
         * @param name the name of the field declared on {@link PDFStreamEngine}
         * @return the current value of that field for this instance
         * @throws RuntimeException if the field does not exist or cannot be
         *             read
         */
        protected Object unrestrictedAccess(String name) {
            try {
                Field f = PDFStreamEngine.class.getDeclaredField(name);
                f.setAccessible(true);
                return f.get(this);
            } catch (ReflectiveOperationException e) {
                throw new RuntimeException("Cannot get access to PDFStreamEngine fields", e);
            }
        }

        /**
         * Returns the parent engine's {@code unsupportedOperators} field.
         *
         * @return the live set held by the parent engine, not a copy
         */
        @SuppressWarnings("unchecked")
        protected Set<String> unsupportedOperators() {
            return (Set<String>) unrestrictedAccess("unsupportedOperators");
        }

        /**
         * Returns the parent engine's {@code operators} field.
         *
         * @return the live operator-name to processor map held by the parent
         *         engine, not a copy
         */
        @SuppressWarnings("unchecked")
        protected Map<String, OperatorProcessor> operators() {
            return (Map<String, OperatorProcessor>) unrestrictedAccess("operators");
        }

        /**
         * Dispatches the operator to its registered processor, if any.
         * <p>
         * An operator without processor is reported at info level the first
         * time it is met by this engine. An {@link IOException} raised during
         * processing is swallowed so that extraction can go on; it is logged
         * only the first time a given top stack frame is seen.
         *
         * @param operator the operator to process
         * @param arguments the operands of the operator
         */
        @Override
        protected void processOperator(PDFOperator operator, List<COSBase> arguments) throws IOException {
            try {
                String operation = operator.getOperation();
                OperatorProcessor processor = operators().get(operation);
                if (processor != null) {
                    processor.setContext(this);
                    processor.process(operator, arguments);
                } else if (unsupportedOperators().add(operation)) {
                    // add() returns true only on first insertion, which saves
                    // a second reflective lookup compared to contains() + add()
                    log.info("unsupported/disabled operation: " + operation);
                }
            } catch (IOException e) {
                // getStackTrace() may legitimately return an empty array;
                // such errors are all filed under the null key
                StackTraceElement[] trace = e.getStackTrace();
                StackTraceElement root = trace.length > 0 ? trace[0] : null;
                synchronized (loggedStacks) {
                    if (!loggedStacks.add(root)) {
                        // already reported
                        return;
                    }
                }
                log.warn("Caught error in pdfbox during extraction (stack logged only once)", e);
            }
        }

    }

    /**
     * Extracts the text of the PDF held by the given blob holder.
     *
     * @param blobHolder the holder whose blob is the PDF to convert
     * @param parameters conversion parameters, not used by this converter
     * @return a holder on a {@code text/plain} UTF-8 blob with the extracted
     *         text, or on an empty blob when the document does not permit
     *         content extraction
     * @throws ConversionException if reading the PDF or producing the text
     *             blob fails with an {@link IOException}
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {

        PDDocument document = null;
        File f = null;
        try {
            document = PDDocument.load(blobHolder.getBlob().getStream());
            // NXP-1556: if document is protected an IOException will be raised
            // Instead of catching the exception based on its message string
            // lets avoid sending messages that will generate this error
            // code taken from PDFTextStripper.writeText source.
            // only care about standard encryption and if it was decrypted with
            // the user password
            AccessPermission permission = document.getCurrentAccessPermission();
            if (!permission.canExtractContent()) {
                return new SimpleCachableBlobHolder(Blobs.createBlob(""));
            }

            PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper();

            // use the position information to heuristically organize the
            // extracted paragraphs. This is also important for
            // right-to-left languages.
            textStripper.setSortByPosition(true);

            String text = textStripper.getText(document);
            // replace non breaking space by regular spaces (why?)
            // text = text.replace("\u00a0", " ");
            f = File.createTempFile("pdfboplugin", ".txt");
            // the writer must be closed before the file is read back, and a
            // failure to write/close must fail the conversion
            try (OutputStream out = new FileOutputStream(f)) {
                out.write(text.getBytes("UTF-8"));
            }
            try (FileInputStream is = new FileInputStream(f)) {
                Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8");
                return new SimpleCachableBlobHolder(blob);
            }
        } catch (IOException e) {
            throw new ConversionException("Error during text extraction with PDFBox", e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    log.error("Error while closing PDFBox document", e);
                }
            }
            if (f != null && !f.delete()) {
                log.warn("Could not delete temporary file: " + f);
            }
        }
    }

    /**
     * Does nothing: this converter needs no configuration.
     *
     * @param descriptor the converter descriptor, ignored
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
    }

}