/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Thibaud Arguillere
 *     Miguel Nixo
 */
package org.nuxeo.ecm.platform.pdf;

import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.NuxeoException;
import org.nuxeo.ecm.core.api.DocumentModel;

/**
 * Extracts raw text from a PDF.
 * <p>
 * The text of the whole document is extracted on first use and then cached in this instance; the line-oriented
 * helpers ({@link #extractLineOf(String)}, {@link #extractLastPartOfLine(String)}) search that cached text.
 *
 * @since 8.10
 */
public class PDFTextExtractor {

    /** Line terminator searched for when locating the end of a line in the extracted text. */
    private static final String END_OF_LINE = "\n";

    /** XPath used when the caller does not provide one. */
    private static final String DEFAULT_XPATH = "file:content";

    private final Blob pdfBlob;

    private String password;

    /** Cache of the full extracted text; {@code null} until the first successful extraction. */
    private String extractedAllAsString;

    /**
     * Constructor with a <code>Blob</code>.
     *
     * @param inBlob Input blob holding the PDF.
     */
    public PDFTextExtractor(Blob inBlob) {
        pdfBlob = inBlob;
    }

    /**
     * Constructor with a <code>DocumentModel</code>. The default value for <code>inXPath</code> (if passed
     * <code>null</code> or "") is <code>file:content</code>.
     *
     * @param inDoc Input DocumentModel.
     * @param inXPath Input XPath.
     */
    public PDFTextExtractor(DocumentModel inDoc, String inXPath) {
        String xpath = StringUtils.isBlank(inXPath) ? DEFAULT_XPATH : inXPath;
        pdfBlob = (Blob) inDoc.getPropertyValue(xpath);
    }

    /**
     * Returns the text of the whole PDF, as produced by a default {@link PDFTextStripper}.
     * <p>
     * The document is loaded (with the password set through {@link #setPassword(String)}, if any) and stripped only
     * on the first call; later calls return the cached string.
     *
     * @return the extracted text
     * @throws NuxeoException if an {@link IOException} occurs while loading the PDF or extracting its text
     */
    public String getAllExtractedLines() throws NuxeoException {
        if (extractedAllAsString != null) {
            return extractedAllAsString;
        }
        try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) {
            PDFTextStripper stripper = new PDFTextStripper();
            extractedAllAsString = stripper.getText(pdfDoc);
        } catch (IOException e) {
            throw new NuxeoException("Failed to handle the pdf", e);
        }
        return extractedAllAsString;
    }

    /**
     * Returns the portion of a line starting at the first occurrence of <code>inString</code> in the extracted text
     * and running to the end of that line, with leading and trailing whitespace trimmed.
     * <p>
     * If the occurrence is on the last line and that line has no terminator, the text up to the end of the document
     * is returned.
     *
     * @param inString the text to look for
     * @return the trimmed line fragment, or <code>null</code> if <code>inString</code> is not found
     * @throws IOException declared for backward compatibility with existing callers
     */
    public String extractLineOf(String inString) throws IOException {
        String allText = getAllExtractedLines();
        int lineStart = allText.indexOf(inString);
        if (lineStart == -1) {
            return null;
        }
        int lineEnd = allText.indexOf(END_OF_LINE, lineStart);
        if (lineEnd == -1) {
            // Match is on an unterminated last line: the line ends where the text ends.
            lineEnd = allText.length();
        }
        return allText.substring(lineStart, lineEnd).trim();
    }

    /**
     * Returns what follows <code>string</code> on the line where it first occurs, i.e. the result of
     * {@link #extractLineOf(String)} with its first <code>string.length()</code> characters removed.
     * <p>
     * The offset is applied to the trimmed line. When <code>string</code> is longer than the trimmed line (for
     * instance it ends with whitespace and matches at the end of a line), an empty string is returned.
     *
     * @param string the text to look for
     * @return the remainder of the line, or <code>null</code> if <code>string</code> is not found
     * @throws IOException declared for backward compatibility with existing callers
     */
    public String extractLastPartOfLine(String string) throws IOException {
        String extractedLine = extractLineOf(string);
        if (extractedLine == null) {
            return null;
        }
        // Trimming may have made the line shorter than the searched text; never index past its end.
        int remainderStart = Math.min(string.length(), extractedLine.length());
        return extractedLine.substring(remainderStart);
    }

    /**
     * Sets the password used to load the PDF. Text already extracted and cached is not discarded.
     *
     * @param password the password of the PDF
     */
    public void setPassword(String password) {
        this.password = password;
    }

}