/*
 * (C) Copyright 2006-2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Julien Anguenot
 *     Florent Guillaume
 */
package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.operator.OperatorProcessor;

import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
051import org.nuxeo.runtime.api.Framework; 052 053public class PDF2TextConverter implements Converter { 054 055 public static class PatchedPDFTextStripper extends PDFTextStripper { 056 057 public PatchedPDFTextStripper() throws IOException { 058 super(); 059 // platform independent line and paragraph separators 060 setLineSeparator("\n"); 061 setParagraphEnd("\n\n"); 062 setArticleEnd("\n\n"); 063 } 064 065 protected Object unrestrictedAccess(String name) { 066 try { 067 Field f = PDFStreamEngine.class.getDeclaredField(name); 068 f.setAccessible(true); 069 return f.get(this); 070 } catch (ReflectiveOperationException e) { 071 throw new RuntimeException("Cannot get access to PDFStreamEngine fields", e); 072 } 073 } 074 075 @SuppressWarnings("unchecked") 076 protected Set<String> unsupportedOperators() { 077 return (Set<String>) unrestrictedAccess("unsupportedOperators"); 078 } 079 080 @SuppressWarnings("unchecked") 081 protected Map<String, OperatorProcessor> operators() { 082 return (Map<String, OperatorProcessor>) unrestrictedAccess("operators"); 083 } 084 085 final static Set<StackTraceElement> loggedStacks = new HashSet<>(); 086 087 @Override 088 protected void processOperator(PDFOperator operator, List<COSBase> arguments) throws IOException { 089 try { 090 String operation = operator.getOperation(); 091 OperatorProcessor processor = operators().get(operation); 092 if (processor != null) { 093 processor.setContext(this); 094 processor.process(operator, arguments); 095 } else { 096 if (!unsupportedOperators().contains(operation)) { 097 log.info("unsupported/disabled operation: " + operation); 098 unsupportedOperators().add(operation); 099 } 100 } 101 } catch (IOException e) { 102 StackTraceElement root = e.getStackTrace()[0]; 103 synchronized (loggedStacks) { 104 if (loggedStacks.contains(root)) { 105 return; 106 } 107 loggedStacks.add(root); 108 } 109 log.warn("Caught error in pdfbox during extraction (stack logged only once)", e); 110 } 111 } 112 113 } 114 115 
private static final Log log = LogFactory.getLog(PDF2TextConverter.class); 116 117 @Override 118 public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { 119 120 PDDocument document = null; 121 File f = null; 122 OutputStream fas = null; 123 try { 124 document = PDDocument.load(blobHolder.getBlob().getStream()); 125 // NXP-1556: if document is protected an IOException will be raised 126 // Instead of catching the exception based on its message string 127 // lets avoid sending messages that will generate this error 128 // code taken from PDFTextStripper.writeText source. 129 // only care about standard encryption and if it was decrypted with 130 // the user password 131 AccessPermission permission = document.getCurrentAccessPermission(); 132 if (permission.canExtractContent()) { 133 PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper(); 134 135 // use the position information to heuristically organize the 136 // extracted paragraphs. This is also important for 137 // right-to-left languages. 138 textStripper.setSortByPosition(true); 139 140 String text = textStripper.getText(document); 141 // replace non breaking space by regular spaces (why?) 
142 // text = text.replace("\u00a0", " "); 143 f = Framework.createTempFile("pdfboplugin", ".txt"); 144 fas = new FileOutputStream(f); 145 fas.write(text.getBytes("UTF-8")); 146 try (FileInputStream is = new FileInputStream(f)) { 147 Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8"); 148 return new SimpleCachableBlobHolder(blob); 149 } 150 } else { 151 return new SimpleCachableBlobHolder(Blobs.createBlob("")); 152 } 153 } catch (IOException e) { 154 throw new ConversionException("Error during text extraction with PDFBox", e); 155 } finally { 156 if (document != null) { 157 try { 158 document.close(); 159 } catch (IOException e) { 160 log.error("Error while closing PDFBox document", e); 161 } 162 } 163 if (fas != null) { 164 try { 165 fas.close(); 166 } catch (IOException e) { 167 log.error(e); 168 } 169 } 170 if (f != null) { 171 f.delete(); 172 } 173 } 174 } 175 176 @Override 177 public void init(ConverterDescriptor descriptor) { 178 } 179 180}