Source code

001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Fred Vadon
019 *     Miguel Nixo
020 */
021package org.nuxeo.ecm.platform.pdf.operations;
022
023import java.io.IOException;
024
025import org.apache.commons.lang3.StringUtils;
026import org.nuxeo.ecm.automation.core.Constants;
027import org.nuxeo.ecm.automation.core.annotations.Context;
028import org.nuxeo.ecm.automation.core.annotations.Operation;
029import org.nuxeo.ecm.automation.core.annotations.OperationMethod;
030import org.nuxeo.ecm.automation.core.annotations.Param;
031import org.nuxeo.ecm.automation.core.collectors.DocumentModelCollector;
032import org.nuxeo.ecm.automation.core.util.DocumentHelper;
033import org.nuxeo.ecm.core.api.CoreSession;
034import org.nuxeo.ecm.core.api.DocumentModel;
035import org.nuxeo.ecm.platform.pdf.PDFTextExtractor;
036
037/**
038 * Extracts raw text from a PDF.
039 * <p>
040 * If the PDF is encrypted, a password is required.
041 *
042 * @since 8.10
043 */
044@Operation(id = PDFExtractTextOperation.ID, category = Constants.CAT_DOCUMENT, label = "PDF: Extract Text", description = "Extracts raw text from a PDF."
045        + " If the PDF is encrypted, a password is required."
046        + " pdfxpath is the xpath of the blob (default to file:content)."
047        + " The extracted text is set in the targetxpath property of the input document, which is saved if save is true."
048        + " If patterntofind is not provided, extracts all the text it can, else it extracts only the line where the pattern is found."
049        + " If patterntofind is provided and removepatternfromresult is true, the line is returned without the pattern.")
050public class PDFExtractTextOperation {
051
052    public static final String ID = "PDF.ExtractText";
053
054    @Context
055    protected CoreSession session;
056
057    @Param(name = "pdfxpath", required = false)
058    protected String pdfxpath = "file:content";
059
060    @Param(name = "save", required = false)
061    protected boolean save = false;
062
063    @Param(name = "targetxpath", required = false)
064    protected String targetxpath;
065
066    @Param(name = "patterntofind", required = false)
067    protected String patterntofind;
068
069    @Param(name = "removepatternfromresult", required = false)
070    protected boolean removepatternfromresult = false;
071
072    @Param(name = "password", required = false)
073    protected String password = null;
074
075    @OperationMethod(collector = DocumentModelCollector.class)
076    public DocumentModel run(DocumentModel input) throws IOException {
077        PDFTextExtractor textExtractor = new PDFTextExtractor(input, pdfxpath);
078        textExtractor.setPassword(password);
079        String extractedText;
080        if (StringUtils.isBlank(patterntofind)) {
081            extractedText = textExtractor.getAllExtractedLines();
082        } else if (removepatternfromresult) {
083            extractedText = textExtractor.extractLastPartOfLine(patterntofind);
084        } else {
085            extractedText = textExtractor.extractLineOf(patterntofind);
086        }
087        if (extractedText != null) {
088            input.setPropertyValue(targetxpath, extractedText);
089        } else {
090            DocumentHelper.removeProperty(input, targetxpath);
091        }
092        if (save) {
093            input = session.saveDocument(input);
094        }
095        return input;
096    }
097
098}