/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Thibaud Arguillere
 *     Fred Vadon
 *     Miguel Nixo
 */
package org.nuxeo.ecm.platform.pdf.operations;

import java.io.IOException;

import org.apache.commons.lang3.StringUtils;
import org.nuxeo.ecm.automation.core.Constants;
import org.nuxeo.ecm.automation.core.annotations.Context;
import org.nuxeo.ecm.automation.core.annotations.Operation;
import org.nuxeo.ecm.automation.core.annotations.OperationMethod;
import org.nuxeo.ecm.automation.core.annotations.Param;
import org.nuxeo.ecm.automation.core.collectors.DocumentModelCollector;
import org.nuxeo.ecm.automation.core.util.DocumentHelper;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.platform.pdf.PDFTextExtractor;

/**
 * Automation operation that extracts raw text from the PDF blob of a document and stores the result in a property
 * of that same document.
 * <p>
 * Three extraction modes are available, selected by the {@code patterntofind} and {@code removepatternfromresult}
 * parameters:
 * <ul>
 * <li>no (or blank) pattern: all the text that can be extracted;</li>
 * <li>pattern, {@code removepatternfromresult} false: only the line where the pattern is found;</li>
 * <li>pattern, {@code removepatternfromresult} true: that line, without the pattern.</li>
 * </ul>
 * If the PDF is encrypted, a password is required.
 * <p>
 * NOTE(review): {@code targetxpath} is declared optional and has no default, yet it is always used as the
 * destination property; callers presumably must always provide it -- confirm.
 *
 * @since 8.10
 */
@Operation(id = PDFExtractTextOperation.ID, category = Constants.CAT_DOCUMENT, label = "PDF: Extract Text", description = "Extracts raw text from a PDF."
        + " If the PDF is encrypted, a password is required."
        + " pdfxpath is the xpath of the blob (default to file:content)."
        + " The extracted text is set in the targetxpath property of the input document, which is saved if save is true."
        + " If patterntofind is not provided, extracts all the text it can, else it extracts only the line where the pattern is found."
        + " If patterntofind is provided and removepatternfromresult is true, the line is returned without the pattern.")
public class PDFExtractTextOperation {

    /** Identifier under which this operation is registered. */
    public static final String ID = "PDF.ExtractText";

    /** Session used to save the document when {@link #save} is true. */
    @Context
    protected CoreSession session;

    /** Xpath of the blob holding the PDF. */
    @Param(name = "pdfxpath", required = false)
    protected String pdfxpath = "file:content";

    /** Whether the input document is saved after its property has been updated; false unless provided. */
    @Param(name = "save", required = false)
    protected boolean save;

    /** Xpath of the property that receives the extracted text. */
    @Param(name = "targetxpath", required = false)
    protected String targetxpath;

    /** Pattern selecting the line to extract; when blank, the whole text is extracted. */
    @Param(name = "patterntofind", required = false)
    protected String patterntofind;

    /** When a pattern is given, whether the pattern is removed from the returned line; false unless provided. */
    @Param(name = "removepatternfromresult", required = false)
    protected boolean removepatternfromresult;

    /** Password handed to the extractor, for encrypted PDFs; null unless provided. */
    @Param(name = "password", required = false)
    protected String password;

    /**
     * Extracts text from the PDF found at {@link #pdfxpath} on the given document and writes it to
     * {@link #targetxpath}. When the extraction yields {@code null}, the target property is removed instead.
     *
     * @param input the document holding the PDF and receiving the extracted text
     * @return the input document, or the document returned by the session's save when {@link #save} is true
     * @throws IOException declared for failures of the underlying text extraction
     */
    @OperationMethod(collector = DocumentModelCollector.class)
    public DocumentModel run(DocumentModel input) throws IOException {
        PDFTextExtractor extractor = new PDFTextExtractor(input, pdfxpath);
        extractor.setPassword(password);

        final String text;
        if (!StringUtils.isBlank(patterntofind)) {
            // A pattern was supplied: keep only the matching line, with or without the pattern itself.
            text = removepatternfromresult ? extractor.extractLastPartOfLine(patterntofind)
                    : extractor.extractLineOf(patterntofind);
        } else {
            text = extractor.getAllExtractedLines();
        }

        if (text == null) {
            // Nothing extracted: clear the target property rather than storing a null value.
            DocumentHelper.removeProperty(input, targetxpath);
        } else {
            input.setPropertyValue(targetxpath, text);
        }

        return save ? session.saveDocument(input) : input;
    }

}