001/*
002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Nuxeo
018 *     Florent Guillaume
019 *     Thierry Delprat
020 */
021package org.nuxeo.ecm.platform.convert.plugins;
022
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.Serializable;
028import java.util.ArrayList;
029import java.util.HashMap;
030import java.util.List;
031import java.util.Map;
032import java.util.regex.Matcher;
033import java.util.regex.Pattern;
034
035import org.apache.commons.lang.StringUtils;
036import org.apache.commons.logging.Log;
037import org.apache.commons.logging.LogFactory;
038import org.artofsolving.jodconverter.OfficeDocumentConverter;
039import org.artofsolving.jodconverter.StandardConversionTask;
040import org.artofsolving.jodconverter.document.DocumentFamily;
041import org.artofsolving.jodconverter.document.DocumentFormat;
042
043import org.nuxeo.common.Environment;
044import org.nuxeo.common.utils.FileUtils;
045import org.nuxeo.ecm.core.api.Blob;
046import org.nuxeo.ecm.core.api.Blobs;
047import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
048import org.nuxeo.ecm.core.convert.api.ConversionException;
049import org.nuxeo.ecm.core.convert.api.ConverterCheckResult;
050import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
051import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
052import org.nuxeo.ecm.core.convert.extension.ExternalConverter;
053import org.nuxeo.ecm.platform.convert.ooomanager.OOoManagerService;
054import org.nuxeo.ecm.platform.mimetype.interfaces.MimetypeRegistry;
055import org.nuxeo.runtime.api.Framework;
056
057/**
058 * Converter based on JOD which uses an external OpenOffice process to do actual conversions.
059 */
060public class JODBasedConverter implements ExternalConverter {
061
062    protected static final String TMP_PATH_PARAMETER = "TmpDirectory";
063
064    private static final Log log = LogFactory.getLog(JODBasedConverter.class);
065
066    /**
067     * Boolean conversion parameter for PDF/A-1.
068     *
069     * @since 5.6
070     */
071    public static final String PDFA1_PARAM = "PDF/A-1";
072
073    /**
074     * Boolean parameter to force update of the document TOC
075     *
076     * @since 5.6
077     */
078    public static final String UPDATE_INDEX_PARAM = StandardConversionTask.UPDATE_DOCUMENT_INDEX;
079
080    protected static final Map<DocumentFamily, String> PDF_FILTER_NAMES = new HashMap<>();
081    {
082        PDF_FILTER_NAMES.put(DocumentFamily.TEXT, "writer_pdf_Export");
083        PDF_FILTER_NAMES.put(DocumentFamily.SPREADSHEET, "calc_pdf_Export");
084        PDF_FILTER_NAMES.put(DocumentFamily.PRESENTATION, "impress_pdf_Export");
085        PDF_FILTER_NAMES.put(DocumentFamily.DRAWING, "draw_pdf_Export");
086    }
087
088    protected ConverterDescriptor descriptor;
089
090    protected String getDestinationMimeType() {
091        return descriptor.getDestinationMimeType();
092    }
093
094    /**
095     * Returns the destination format for the given plugin.
096     * <p>
097     * It takes the actual destination mimetype from the plugin configuration.
098     *
099     * @param sourceFormat the source format
100     * @param pdfa1 true if PDF/A-1 is required
101     */
102    protected DocumentFormat getDestinationFormat(OfficeDocumentConverter documentConverter,
103            DocumentFormat sourceFormat, boolean pdfa1) {
104        String mimeType = getDestinationMimeType();
105        DocumentFormat destinationFormat = documentConverter.getFormatRegistry().getFormatByMediaType(mimeType);
106        if ("application/pdf".equals(mimeType)) {
107            destinationFormat = extendPDFFormat(sourceFormat, destinationFormat, pdfa1);
108        }
109        return destinationFormat;
110    }
111
112    protected DocumentFormat extendPDFFormat(DocumentFormat sourceFormat, DocumentFormat defaultFormat, boolean pdfa1) {
113        DocumentFamily sourceFamily = sourceFormat.getInputFamily();
114        String sourceMediaType = sourceFormat.getMediaType();
115        DocumentFormat pdfFormat = new DocumentFormat(pdfa1 ? "PDF/A-1" : "PDF", "pdf", "application/pdf");
116        Map<DocumentFamily, Map<String, ?>> storePropertiesByFamily = new HashMap<>();
117        Map<DocumentFamily, Map<String, ?>> defaultStorePropertiesByFamily = defaultFormat.getStorePropertiesByFamily();
118        for (DocumentFamily family : defaultStorePropertiesByFamily.keySet()) {
119            if (family.equals(sourceFamily)) {
120                continue;
121            }
122            storePropertiesByFamily.put(family, defaultStorePropertiesByFamily.get(family));
123        }
124        storePropertiesByFamily.put(sourceFamily,
125                extendPDFStoreProperties(sourceMediaType, pdfa1, defaultStorePropertiesByFamily.get(sourceFamily)));
126        pdfFormat.setStorePropertiesByFamily(storePropertiesByFamily);
127        return pdfFormat;
128    }
129
130    protected Map<String, Object> extendPDFStoreProperties(String mediatype, boolean pdfa1,
131            Map<String, ?> originalProperties) {
132        Map<String, Object> extendedProperties = new HashMap<>();
133        for (Map.Entry<String, ?> entry : originalProperties.entrySet()) {
134            extendedProperties.put(entry.getKey(), entry.getValue());
135        }
136        if ("text/html".equals(mediatype)) {
137            extendedProperties.put("FilterName", "writer_web_pdf_Export");
138        }
139        if (pdfa1) {
140            Map<String, Object> filterData = new HashMap<>();
141            filterData.put("SelectPdfVersion", Integer.valueOf(1)); // PDF/A-1
142            filterData.put("UseTaggedPDF", Boolean.TRUE); // per spec
143            extendedProperties.put("FilterData", filterData);
144        }
145        return extendedProperties;
146    }
147
148    /**
149     * Returns the format for the file passed as a parameter.
150     * <p>
151     * We will ask the mimetype registry service to sniff its mimetype.
152     *
153     * @return DocumentFormat for the given file
154     */
155    private static DocumentFormat getSourceFormat(OfficeDocumentConverter documentConverter, File file) {
156        MimetypeRegistry mimetypeRegistry = Framework.getService(MimetypeRegistry.class);
157        String mimetypeStr = mimetypeRegistry.getMimetypeFromFile(file);
158        DocumentFormat format = documentConverter.getFormatRegistry().getFormatByMediaType(mimetypeStr);
159        return format;
160    }
161
162    /**
163     * Returns the DocumentFormat for the given mimetype.
164     *
165     * @return DocumentFormat for the given mimetype
166     */
167    private static DocumentFormat getSourceFormat(OfficeDocumentConverter documentConverter, String mimetype) {
168        return documentConverter.getFormatRegistry().getFormatByMediaType(mimetype);
169    }
170
171    @Override
172    protected void finalize() throws Throwable {
173        super.finalize();
174    }
175
176    @Override
177    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
178        blobHolder = new UTF8CharsetConverter().convert(blobHolder, parameters);
179        Blob inputBlob = blobHolder.getBlob();
180        String blobPath = blobHolder.getFilePath();
181        if (inputBlob == null) {
182            return null;
183        }
184
185        OfficeDocumentConverter documentConverter = newDocumentConverter();
186        // This plugin do deal only with one input source.
187        String sourceMimetype = inputBlob.getMimeType();
188
189        boolean pdfa1 = parameters != null && Boolean.TRUE.equals(parameters.get(PDFA1_PARAM));
190
191        File sourceFile = null;
192        File outFile = null;
193        File[] files = null;
194        try {
195
196            // If the input blob has the HTML mime type, make sure the
197            // charset meta is present, add it if not
198            if ("text/html".equals(sourceMimetype)) {
199                inputBlob = checkCharsetMeta(inputBlob);
200            }
201
202            // Get original file extension
203            String ext = inputBlob.getFilename();
204            int dotPosition = ext.lastIndexOf('.');
205            if (dotPosition == -1) {
206                ext = ".bin";
207            } else {
208                ext = ext.substring(dotPosition);
209            }
210            // Copy in a file to be able to read it several time
211            sourceFile = Framework.createTempFile("NXJOOoConverterDocumentIn", ext);
212            InputStream stream = inputBlob.getStream();
213            FileUtils.copyToFile(stream, sourceFile);
214            stream.close();
215
216            DocumentFormat sourceFormat = null;
217            if (sourceMimetype != null) {
218                // Try to fetch it from the registry.
219                sourceFormat = getSourceFormat(documentConverter, sourceMimetype);
220            }
221            // If not found in the registry or not given as a parameter.
222            // Try to sniff ! What does that smell ? :)
223            if (sourceFormat == null) {
224                sourceFormat = getSourceFormat(documentConverter, sourceFile);
225            }
226
227            // From plugin settings because we know the destination
228            // mimetype.
229            DocumentFormat destinationFormat = getDestinationFormat(documentConverter, sourceFormat, pdfa1);
230
231            // allow HTML2PDF filtering
232
233            List<Blob> blobs = new ArrayList<>();
234
235            if (descriptor.getDestinationMimeType().equals("text/html")) {
236                String tmpDirPath = getTmpDirectory();
237                File myTmpDir = new File(tmpDirPath + "/JODConv_" + System.currentTimeMillis());
238                boolean created = myTmpDir.mkdir();
239                if (!created) {
240                    throw new IOException("Unable to create temp dir");
241                }
242
243                outFile = new File(myTmpDir.getAbsolutePath() + "/" + "NXJOOoConverterDocumentOut."
244                        + destinationFormat.getExtension());
245
246                created = outFile.createNewFile();
247                if (!created) {
248                    throw new IOException("Unable to create temp file");
249                }
250
251                log.debug("Input File = " + outFile.getAbsolutePath());
252                // Perform the actual conversion.
253                documentConverter.convert(sourceFile, outFile, destinationFormat);
254
255                files = myTmpDir.listFiles();
256                for (File file : files) {
257                    // copy the files to a new tmp location, as we'll delete them
258                    Blob blob;
259                    try (FileInputStream in = new FileInputStream(file)) {
260                        blob = Blobs.createBlob(in);
261                    }
262                    blob.setFilename(file.getName());
263                    blobs.add(blob);
264                    // add a blob for the index
265                    if (file.getName().equals(outFile.getName())) {
266                        Blob indexBlob;
267                        try (FileInputStream in = new FileInputStream(file)) {
268                            indexBlob = Blobs.createBlob(in);
269                        }
270                        indexBlob.setFilename("index.html");
271                        blobs.add(0, indexBlob);
272                    }
273                }
274
275            } else {
276                outFile = Framework.createTempFile("NXJOOoConverterDocumentOut", '.' + destinationFormat.getExtension());
277
278                // Perform the actual conversion.
279                documentConverter.convert(sourceFile, outFile, destinationFormat, parameters);
280
281                Blob blob;
282                try (FileInputStream in = new FileInputStream(outFile)) {
283                    blob = Blobs.createBlob(in, getDestinationMimeType());
284                }
285                blobs.add(blob);
286            }
287            return new SimpleCachableBlobHolder(blobs);
288        } catch (IOException e) {
289            String msg = String.format("An error occurred trying to convert file %s to from %s to %s", blobPath,
290                    sourceMimetype, getDestinationMimeType());
291            throw new ConversionException(msg, e);
292        } finally {
293            if (sourceFile != null) {
294                sourceFile.delete();
295            }
296            if (outFile != null) {
297                outFile.delete();
298            }
299
300            if (files != null) {
301                for (File file : files) {
302                    if (file.exists()) {
303                        file.delete();
304                    }
305                }
306            }
307        }
308
309    }
310
311    protected OfficeDocumentConverter newDocumentConverter() throws ConversionException {
312        OOoManagerService oooManagerService = Framework.getService(OOoManagerService.class);
313        OfficeDocumentConverter documentConverter = oooManagerService.getDocumentConverter();
314        if (documentConverter == null) {
315            throw new ConversionException("Could not connect to the remote OpenOffice server");
316        }
317        return documentConverter;
318    }
319
320    @SuppressWarnings("hiding")
321    @Override
322    public void init(ConverterDescriptor descriptor) {
323        this.descriptor = descriptor;
324    }
325
326    @Override
327    public ConverterCheckResult isConverterAvailable() {
328        ConverterCheckResult result = new ConverterCheckResult();
329        OOoManagerService oooManagerService = Framework.getService(OOoManagerService.class);
330        if (!oooManagerService.isOOoManagerStarted()) {
331            result.setAvailable(false);
332        }
333        return result;
334    }
335
336    protected String getTmpDirectory() {
337        String tmp = null;
338        Map<String, String> parameters = descriptor.getParameters();
339        if (parameters != null && parameters.containsKey(TMP_PATH_PARAMETER)) {
340            tmp = parameters.get(TMP_PATH_PARAMETER);
341        }
342        if (tmp == null) {
343            tmp = Environment.getDefault().getTemp().getPath();
344        }
345        return tmp;
346    }
347
348    /**
349     * Checks if the {@code inputBlob} string contains a {@code charset} meta tag. If not, add it.
350     *
351     * @param inputBlob the input blob
352     * @throws IOException Signals that an I/O exception has occurred.
353     */
354    protected Blob checkCharsetMeta(Blob inputBlob) throws IOException {
355
356        String charset = inputBlob.getEncoding();
357        if (!StringUtils.isEmpty(charset)) {
358            Pattern charsetMetaPattern = Pattern.compile(String.format("content=\"text/html;\\s*charset=%s\"", charset));
359            Matcher charsetMetaMatcher = charsetMetaPattern.matcher(inputBlob.getString());
360            if (!charsetMetaMatcher.find()) {
361                String charsetMetaTag = String.format(
362                        "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">", charset);
363                StringBuilder sb = new StringBuilder(charsetMetaTag);
364                sb.append(new String(inputBlob.getByteArray(), charset));
365                Blob blobWithCharsetMetaTag = Blobs.createBlob(sb.toString(), "text/html", charset,
366                        inputBlob.getFilename());
367                return blobWithCharsetMetaTag;
368            }
369        }
370        return inputBlob;
371    }
372}