Source code

001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Thibaud Arguillere
018 *     Miguel Nixo
019 */
020package org.nuxeo.ecm.platform.pdf.operations;
021
022import java.io.IOException;
023import org.nuxeo.ecm.automation.core.Constants;
024import org.nuxeo.ecm.automation.core.annotations.Context;
025import org.nuxeo.ecm.automation.core.annotations.Operation;
026import org.nuxeo.ecm.automation.core.annotations.OperationMethod;
027import org.nuxeo.ecm.automation.core.annotations.Param;
028import org.nuxeo.ecm.automation.core.collectors.DocumentModelCollector;
029import org.nuxeo.ecm.automation.core.util.Properties;
030import org.nuxeo.ecm.core.api.Blob;
031import org.nuxeo.ecm.core.api.CoreSession;
032import org.nuxeo.ecm.core.api.DocumentModel;
033import org.nuxeo.ecm.platform.pdf.PDFInfo;
034
035/**
036 * Extracts PDF info to specific fields.
037 * <p>
038 * If there is no blob or if the blob is not a PDF document, we empty the values.
039 * <p>
040 * <b>IMPORTANT</b> We don't check if the blob is a PDF or not. If it is not, this will likely lead to PDFBox errors.
041 * <p>
042 * For the values to use in the properties parameter, see {@link PDFInfo#toHashMap}.
043 *
044 * @since 8.10
045 */
046@Operation(id = PDFExtractInfoOperation.ID, category = Constants.CAT_DOCUMENT, label = "PDF: Extract Info",
047    description = "Extract the info of the PDF stored in <code>xpath</code> and put it in the fields referenced by " +
048        "<code>properties</code>. <code>properties</code> is a <code>key=value</code> list (one key-value pair/line, " +
049        "where <code>key</code> is the xpath of the destination field and <code>value</code> is the exact label " +
050        "(case sensitive) as returned by the PageExtractor (see this operation documentation). If there is no blob " +
051        "or the blob is not a PDF, all the values referenced in <code>properties</code> are cleared (set to empty " +
052        "string, 0, ...).")
053public class PDFExtractInfoOperation {
054
055    public static final String ID = "PDF.ExtractInfo";
056
057    @Context
058    protected CoreSession session;
059
060    @Param(name = "xpath", required = false, values = { "file:content" })
061    protected String xpath = "file:content";
062
063    // The map has the xpath as key and the metadata property as value.
064    // For example, say we have a custom pdfinfo schema:
065    // pdfinfo:title=Title
066    // pdfinfo:producer=PDF Producer
067    // pdfinfo:mediabox_width=Media box width
068    // ...
069    @Param(name = "properties", required = false)
070    protected Properties properties;
071
072    @Param(name = "save", required = false, values = { "true" })
073    protected boolean save = true;
074
075    @OperationMethod(collector = DocumentModelCollector.class)
076    public DocumentModel run(DocumentModel inDoc) throws IOException {
077        // Get the blob
078        // If there is no blob, we empty all the values
079        if (properties == null) {
080            properties = new Properties();
081        }
082        Blob theBlob = (Blob) inDoc.getPropertyValue(xpath);
083        if (theBlob == null || (theBlob.getMimeType() != null && !theBlob.getMimeType().equals("application/pdf"))) {
084            for (String inXPath : properties.keySet()) {
085                inDoc.setPropertyValue(inXPath, "");
086            }
087            if (save) {
088                session.saveDocument(inDoc);
089            }
090        } else {
091            PDFInfo info = new PDFInfo(inDoc);
092            inDoc = info.toFields(inDoc, properties, save, session);
093        }
094        return inDoc;
095    }
096
097}