001/* 002 * (C) Copyright 2006-2016 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Julien Anguenot 018 * Florent Guillaume 019 */ 020package org.nuxeo.ecm.core.convert.plugins.text.extractors; 021 022import java.io.File; 023import java.io.FileInputStream; 024import java.io.FileOutputStream; 025import java.io.IOException; 026import java.io.OutputStream; 027import java.io.Serializable; 028import java.util.HashSet; 029import java.util.List; 030import java.util.Map; 031import java.util.Set; 032 033import org.apache.commons.logging.Log; 034import org.apache.commons.logging.LogFactory; 035import org.apache.pdfbox.contentstream.operator.Operator; 036import org.apache.pdfbox.cos.COSBase; 037import org.apache.pdfbox.pdmodel.PDDocument; 038import org.apache.pdfbox.pdmodel.encryption.AccessPermission; 039import org.apache.pdfbox.text.PDFTextStripper; 040import org.nuxeo.ecm.core.api.Blob; 041import org.nuxeo.ecm.core.api.Blobs; 042import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 043import org.nuxeo.ecm.core.convert.api.ConversionException; 044import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder; 045import org.nuxeo.ecm.core.convert.extension.Converter; 046import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor; 047import org.nuxeo.runtime.api.Framework; 048 049public class PDF2TextConverter implements Converter { 050 051 public static class PatchedPDFTextStripper extends PDFTextStripper { 052 053 public PatchedPDFTextStripper() throws IOException { 054 super(); 055 // platform independent line and paragraph separators 056 setLineSeparator("\n"); 057 setParagraphEnd("\n\n"); 058 setArticleEnd("\n\n"); 059 } 060 061 final static Set<StackTraceElement> loggedStacks = new HashSet<>(); 062 063 @Override 064 protected void operatorException(Operator operator, List<COSBase> operands, IOException e) throws IOException { 065 StackTraceElement root = e.getStackTrace()[0]; 066 synchronized (loggedStacks) { 067 if (loggedStacks.contains(root)) { 068 return; 069 } 070 loggedStacks.add(root); 071 } 072 log.warn("Caught error in pdfbox during extraction (stack logged only once)", e); 073 } 074 075 } 076 077 private static final Log log = LogFactory.getLog(PDF2TextConverter.class); 078 079 @Override 080 public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { 081 082 PDDocument document = null; 083 File f = null; 084 OutputStream fas = null; 085 try { 086 document = PDDocument.load(blobHolder.getBlob().getStream()); 087 // NXP-1556: if document is protected an IOException will be raised 088 // Instead of catching the exception based on its message string 089 // lets avoid sending messages that will generate this error 090 // code taken from PDFTextStripper.writeText source. 091 // only care about standard encryption and if it was decrypted with 092 // the user password 093 AccessPermission permission = document.getCurrentAccessPermission(); 094 if (permission.canExtractContent()) { 095 PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper(); 096 097 // use the position information to heuristically organize the 098 // extracted paragraphs. This is also important for 099 // right-to-left languages. 100 textStripper.setSortByPosition(true); 101 102 String text = textStripper.getText(document); 103 // replace non breaking space by regular spaces (why?) 104 // text = text.replace("\u00a0", " "); 105 f = Framework.createTempFile("pdfboplugin", ".txt"); 106 fas = new FileOutputStream(f); 107 fas.write(text.getBytes("UTF-8")); 108 try (FileInputStream is = new FileInputStream(f)) { 109 Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8"); 110 return new SimpleCachableBlobHolder(blob); 111 } 112 } else { 113 return new SimpleCachableBlobHolder(Blobs.createBlob("")); 114 } 115 } catch (IOException e) { 116 throw new ConversionException("Error during text extraction with PDFBox", blobHolder, e); 117 } finally { 118 if (document != null) { 119 try { 120 document.close(); 121 } catch (IOException e) { 122 log.error("Error while closing PDFBox document", e); 123 } 124 } 125 if (fas != null) { 126 try { 127 fas.close(); 128 } catch (IOException e) { 129 log.error(e); 130 } 131 } 132 if (f != null) { 133 f.delete(); 134 } 135 } 136 } 137 138 @Override 139 public void init(ConverterDescriptor descriptor) { 140 } 141 142}