001/* 002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Thibaud Arguillere 018 * Miguel Nixo 019 */ 020package org.nuxeo.ecm.platform.pdf.operations; 021 022import java.io.IOException; 023import org.nuxeo.ecm.automation.core.Constants; 024import org.nuxeo.ecm.automation.core.annotations.Context; 025import org.nuxeo.ecm.automation.core.annotations.Operation; 026import org.nuxeo.ecm.automation.core.annotations.OperationMethod; 027import org.nuxeo.ecm.automation.core.annotations.Param; 028import org.nuxeo.ecm.automation.core.collectors.DocumentModelCollector; 029import org.nuxeo.ecm.automation.core.util.Properties; 030import org.nuxeo.ecm.core.api.Blob; 031import org.nuxeo.ecm.core.api.CoreSession; 032import org.nuxeo.ecm.core.api.DocumentModel; 033import org.nuxeo.ecm.platform.pdf.PDFInfo; 034 035/** 036 * Extracts PDF info to specific fields. 037 * <p> 038 * If there is no blob or if the blob is not a PDF document, we empty the values. 039 * <p> 040 * <b>IMPORTANT</b> We don't check if the blob is a PDF or not. If it is not, this will likely lead to PDFBox errors. 041 * <p> 042 * For the values to use in the properties parameter, see {@link PDFInfo#toHashMap}. 043 * 044 * @since 8.10 045 */ 046@Operation(id = PDFExtractInfoOperation.ID, category = Constants.CAT_DOCUMENT, label = "PDF: Extract Info", 047 description = "Extract the info of the PDF stored in <code>xpath</code> and put it in the fields referenced by " + 048 "<code>properties</code>. <code>properties</code> is a <code>key=value</code> list (one key-value pair/line, " + 049 "where <code>key</code> is the xpath of the destination field and <code>value</code> is the exact label " + 050 "(case sensitive) as returned by the PageExtractor (see this operation documentation). If there is no blob " + 051 "or the blob is not a PDF, all the values referenced in <code>properties</code> are cleared (set to empty " + 052 "string, 0, ...).") 053public class PDFExtractInfoOperation { 054 055 public static final String ID = "PDF.ExtractInfo"; 056 057 @Context 058 protected CoreSession session; 059 060 @Param(name = "xpath", required = false, values = { "file:content" }) 061 protected String xpath = "file:content"; 062 063 // The map has the xpath as key and the metadata property as value. 064 // For example, say we have a custom pdfinfo schema: 065 // pdfinfo:title=Title 066 // pdfinfo:producer=PDF Producer 067 // pdfinfo:mediabox_width=Media box width 068 // ... 069 @Param(name = "properties", required = false) 070 protected Properties properties; 071 072 @Param(name = "save", required = false, values = { "true" }) 073 protected boolean save = true; 074 075 @OperationMethod(collector = DocumentModelCollector.class) 076 public DocumentModel run(DocumentModel inDoc) throws IOException { 077 // Get the blob 078 // If there is no blob, we empty all the values 079 if (properties == null) { 080 properties = new Properties(); 081 } 082 Blob theBlob = (Blob) inDoc.getPropertyValue(xpath); 083 if (theBlob == null || (theBlob.getMimeType() != null && !theBlob.getMimeType().equals("application/pdf"))) { 084 for (String inXPath : properties.keySet()) { 085 inDoc.setPropertyValue(inXPath, ""); 086 } 087 if (save) { 088 session.saveDocument(inDoc); 089 } 090 } else { 091 PDFInfo info = new PDFInfo(inDoc); 092 inDoc = info.toFields(inDoc, properties, save, session); 093 } 094 return inDoc; 095 } 096 097}