001/* 002 * (C) Copyright 2016-2018 Nuxeo (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Thibaud Arguillere 018 * Miguel Nixo 019 */ 020package org.nuxeo.ecm.platform.pdf; 021 022import java.io.IOException; 023 024import org.apache.commons.lang3.StringUtils; 025import org.apache.pdfbox.pdmodel.PDDocument; 026import org.apache.pdfbox.util.PDFTextStripper; 027import org.nuxeo.ecm.core.api.Blob; 028import org.nuxeo.ecm.core.api.DocumentModel; 029import org.nuxeo.ecm.core.api.NuxeoException; 030 031/** 032 * Extracts raw text from a PDF. 033 * 034 * @since 8.10 035 */ 036public class PDFTextExtractor { 037 038 private Blob pdfBlob; 039 040 private String password; 041 042 private String extractedAllAsString; 043 044 private static final String END_OF_LINE = "\n"; 045 046 public PDFTextExtractor(Blob inBlob) { 047 pdfBlob = inBlob; 048 } 049 050 /** 051 * Constructor with a <code>DocumentModel</code>. The default value for <code>inXPath</code> (if passed 052 * <code>null</code> or "") is <code>file:content</code>. 053 * 054 * @param inDoc Input DocumentModel. 055 * @param inXPath Input XPath. 056 */ 057 public PDFTextExtractor(DocumentModel inDoc, String inXPath) { 058 if (StringUtils.isBlank(inXPath)) { 059 inXPath = "file:content"; 060 } 061 pdfBlob = (Blob) inDoc.getPropertyValue(inXPath); 062 } 063 064 public String getAllExtractedLines() throws NuxeoException { 065 if (extractedAllAsString == null) { 066 try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) { 067 PDFTextStripper stripper = new PDFTextStripper(); 068 extractedAllAsString = stripper.getText(pdfDoc); 069 } catch (IOException e) { 070 throw new NuxeoException("Failed to handle the pdf", e); 071 } 072 } 073 return extractedAllAsString; 074 } 075 076 public String extractLineOf(String inString) throws IOException { 077 String extractedLine = null; 078 int lineBegining = getAllExtractedLines().indexOf(inString); 079 if (lineBegining != -1) { 080 int lineEnd = getAllExtractedLines().indexOf(END_OF_LINE, lineBegining); 081 extractedLine = getAllExtractedLines().substring(lineBegining, lineEnd).trim(); 082 } 083 return extractedLine; 084 } 085 086 public String extractLastPartOfLine(String string) throws IOException { 087 String extractedLine = extractLineOf(string); 088 if (extractedLine != null) { 089 return extractedLine.substring(string.length(), extractedLine.length()); 090 } 091 return null; 092 } 093 094 public void setPassword(String password) { 095 this.password = password; 096 } 097 098}