Source code

001/*
002 * (C) Copyright 2016-2018 Nuxeo (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf;
021
022import java.io.IOException;
023
024import org.apache.commons.lang3.StringUtils;
025import org.apache.pdfbox.pdmodel.PDDocument;
026import org.apache.pdfbox.text.PDFTextStripper;
027import org.nuxeo.ecm.core.api.Blob;
028import org.nuxeo.ecm.core.api.DocumentModel;
029import org.nuxeo.ecm.core.api.NuxeoException;
030
031/**
032 * Extracts raw text from a PDF.
033 *
034 * @since 8.10
035 */
036public class PDFTextExtractor {
037
038    private Blob pdfBlob;
039
040    private String password;
041
042    private String extractedAllAsString;
043
044    private static final String END_OF_LINE = "\n";
045
046    public PDFTextExtractor(Blob inBlob) {
047        pdfBlob = inBlob;
048    }
049
050    /**
051     * Constructor with a <code>DocumentModel</code>. The default value for <code>inXPath</code> (if passed
052     * <code>null</code> or "") is <code>file:content</code>.
053     *
054     * @param inDoc Input DocumentModel.
055     * @param inXPath Input XPath.
056     */
057    public PDFTextExtractor(DocumentModel inDoc, String inXPath) {
058        if (StringUtils.isBlank(inXPath)) {
059            inXPath = "file:content";
060        }
061        pdfBlob = (Blob) inDoc.getPropertyValue(inXPath);
062    }
063
064    public String getAllExtractedLines() throws NuxeoException {
065        if (extractedAllAsString == null) {
066            try (PDDocument pdfDoc = PDFUtils.load(pdfBlob, password)) {
067                PDFTextStripper stripper = new PDFTextStripper();
068                extractedAllAsString = stripper.getText(pdfDoc);
069            } catch (IOException e) {
070                throw new NuxeoException("Failed to handle the pdf", e);
071            }
072        }
073        return extractedAllAsString;
074    }
075
076    public String extractLineOf(String inString) throws IOException {
077        String extractedLine = null;
078        int lineBegining = getAllExtractedLines().indexOf(inString);
079        if (lineBegining != -1) {
080            int lineEnd = getAllExtractedLines().indexOf(END_OF_LINE, lineBegining);
081            extractedLine = getAllExtractedLines().substring(lineBegining, lineEnd).trim();
082        }
083        return extractedLine;
084    }
085
086    public String extractLastPartOfLine(String string) throws IOException {
087        String extractedLine = extractLineOf(string);
088        if (extractedLine != null) {
089            return extractedLine.substring(string.length(), extractedLine.length());
090        }
091        return null;
092    }
093
094    public void setPassword(String password) {
095        this.password = password;
096    }
097
098}