Source code

001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.io.IOException;
023import org.apache.commons.lang.StringUtils;
024import org.apache.pdfbox.pdmodel.PDDocument;
025import org.apache.pdfbox.util.PDFTextStripper;
026import org.nuxeo.ecm.core.api.Blob;
027import org.nuxeo.ecm.core.api.NuxeoException;
028import org.nuxeo.ecm.core.api.DocumentModel;
029
030/**
031 * Extracts raw text from a PDF.
032 *
033 * @since 8.10
034 */
035public class PDFTextExtractor {
036
037    private Blob pdfBlob;
038
039    private String password;
040
041    private String extractedAllAsString;
042
043    private static final String END_OF_LINE = "\n";
044
045    public PDFTextExtractor(Blob inBlob) {
046        pdfBlob = inBlob;
047    }
048
049    /**
050     * Constructor with a <code>DocumentModel</code>. The default value for <code>inXPath</code> (if passed
051     * <code>null</code> or "") is <code>file:content</code>.
052     *
053     * @param inDoc Input DocumentModel.
054     * @param inXPath Input XPath.
055     */
056    public PDFTextExtractor(DocumentModel inDoc, String inXPath) {
057        if (StringUtils.isBlank(inXPath)) {
058            inXPath = "file:content";
059        }
060        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
061    }
062
063    public String getAllExtractedLines() throws NuxeoException {
064        if (extractedAllAsString == null) {
065            try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) {
066                PDFTextStripper stripper = new PDFTextStripper();
067                extractedAllAsString = stripper.getText(pdfDoc);
068            } catch (IOException e) {
069                throw new NuxeoException("Failed to handle the pdf", e);
070            }
071        }
072        return extractedAllAsString;
073    }
074
075    public String extractLineOf(String inString) throws IOException {
076        String extractedLine = null;
077        int lineBegining = getAllExtractedLines().indexOf(inString);
078        if (lineBegining != -1) {
079            int lineEnd = getAllExtractedLines().indexOf(END_OF_LINE, lineBegining);
080            extractedLine = getAllExtractedLines().substring(lineBegining, lineEnd).trim();
081        }
082        return extractedLine;
083    }
084
085    public String extractLastPartOfLine(String string) throws IOException {
086        String extractedLine = extractLineOf(string);
087        if (extractedLine != null) {
088            return extractedLine.substring(string.length(), extractedLine.length());
089        }
090        return null;
091    }
092
093    public void setPassword(String password) {
094        this.password = password;
095    }
096
097}
098