001/* 002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Thibaud Arguillere 018 * Fred Vadon 019 * Miguel Nixo 020 */ 021package org.nuxeo.ecm.platform.pdf.operations; 022 023import java.io.IOException; 024import org.nuxeo.ecm.automation.core.Constants; 025import org.nuxeo.ecm.automation.core.annotations.Context; 026import org.nuxeo.ecm.automation.core.annotations.Operation; 027import org.nuxeo.ecm.automation.core.annotations.OperationMethod; 028import org.nuxeo.ecm.automation.core.annotations.Param; 029import org.nuxeo.ecm.automation.core.collectors.DocumentModelCollector; 030import org.nuxeo.ecm.automation.core.util.DocumentHelper; 031import org.nuxeo.ecm.core.api.CoreSession; 032import org.nuxeo.ecm.core.api.DocumentModel; 033import org.nuxeo.ecm.platform.pdf.PDFTextExtractor; 034 035/** 036 * Extracts raw text from a PDF. 037 * <p> 038 * If the PDF is encrypted, a password is required. 039 * 040 * @since 8.10 041 */ 042@Operation(id = PDFExtractTextOperation.ID, category = Constants.CAT_DOCUMENT, label = "PDF: Extract Text", 043 description = "Extracts raw text from a PDF. If the PDF is encrypted, a password is required.") 044public class PDFExtractTextOperation { 045 046 public static final String ID = "PDF.ExtractText"; 047 048 @Context 049 protected CoreSession session; 050 051 @Param(name = "pdfxpath", required = false) 052 protected String pdfxpath = "file:content"; 053 054 @Param(name = "save", required = false) 055 protected boolean save = false; 056 057 @Param(name = "targetxpath", required = false) 058 protected String targetxpath; 059 060 @Param(name = "patterntofind", required = false) 061 protected String patterntofind; 062 063 @Param(name = "removepatternfromresult", required = false) 064 protected boolean removepatternfromresult = false; 065 066 @Param(name = "password", required = false) 067 protected String password = null; 068 069 @OperationMethod(collector = DocumentModelCollector.class) 070 public DocumentModel run(DocumentModel input) throws IOException { 071 PDFTextExtractor textExtractor = new PDFTextExtractor(input, pdfxpath); 072 textExtractor.setPassword(password); 073 String extractedText = removepatternfromresult ? 074 textExtractor.extractLastPartOfLine(patterntofind) : textExtractor.extractLineOf(patterntofind); 075 if (extractedText != null) { 076 input.setPropertyValue(targetxpath, extractedText); 077 } else { 078 DocumentHelper.removeProperty(input, targetxpath); 079 } 080 if (save) { 081 input = session.saveDocument(input); 082 } 083 return input; 084 } 085 086}