001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Fred Vadon
019 *     Miguel Nixo
020 */
021package org.nuxeo.ecm.platform.pdf.operations;
022
023import java.io.IOException;
024import org.nuxeo.ecm.automation.core.Constants;
025import org.nuxeo.ecm.automation.core.annotations.Context;
026import org.nuxeo.ecm.automation.core.annotations.Operation;
027import org.nuxeo.ecm.automation.core.annotations.OperationMethod;
028import org.nuxeo.ecm.automation.core.annotations.Param;
029import org.nuxeo.ecm.automation.core.collectors.DocumentModelCollector;
030import org.nuxeo.ecm.automation.core.util.DocumentHelper;
031import org.nuxeo.ecm.core.api.CoreSession;
032import org.nuxeo.ecm.core.api.DocumentModel;
033import org.nuxeo.ecm.platform.pdf.PDFTextExtractor;
034
035/**
036 * Extracts raw text from a PDF.
037 * <p>
038 * If the PDF is encrypted, a password is required.
039 *
040 * @since 8.10
041 */
042@Operation(id = PDFExtractTextOperation.ID, category = Constants.CAT_DOCUMENT, label = "PDF: Extract Text",
043    description = "Extracts raw text from a PDF. If the PDF is encrypted, a password is required.")
044public class PDFExtractTextOperation {
045
046    public static final String ID = "PDF.ExtractText";
047
048    @Context
049    protected CoreSession session;
050
051    @Param(name = "pdfxpath", required = false)
052    protected String pdfxpath = "file:content";
053
054    @Param(name = "save", required = false)
055    protected boolean save = false;
056
057    @Param(name = "targetxpath", required = false)
058    protected String targetxpath;
059
060    @Param(name = "patterntofind", required = false)
061    protected String patterntofind;
062
063    @Param(name = "removepatternfromresult", required = false)
064    protected boolean removepatternfromresult = false;
065
066    @Param(name = "password", required = false)
067    protected String password = null;
068
069    @OperationMethod(collector = DocumentModelCollector.class)
070    public DocumentModel run(DocumentModel input) throws IOException {
071        PDFTextExtractor textExtractor = new PDFTextExtractor(input, pdfxpath);
072        textExtractor.setPassword(password);
073        String extractedText = removepatternfromresult ?
074            textExtractor.extractLastPartOfLine(patterntofind) : textExtractor.extractLineOf(patterntofind);
075        if (extractedText != null) {
076            input.setPropertyValue(targetxpath, extractedText);
077        } else {
078            DocumentHelper.removeProperty(input, targetxpath);
079        }
080        if (save) {
081            input = session.saveDocument(input);
082        }
083        return input;
084    }
085
086}