001/*
002 * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Nuxeo
016 *     Florent Guillaume
017 *     Thierry Delprat
018 */
019package org.nuxeo.ecm.platform.convert.plugins;
020
021import java.io.File;
022import java.io.FileInputStream;
023import java.io.IOException;
024import java.io.InputStream;
025import java.io.Serializable;
026import java.util.ArrayList;
027import java.util.HashMap;
028import java.util.List;
029import java.util.Map;
030import java.util.regex.Matcher;
031import java.util.regex.Pattern;
032
033import org.apache.commons.lang.StringUtils;
034import org.apache.commons.logging.Log;
035import org.apache.commons.logging.LogFactory;
036import org.artofsolving.jodconverter.OfficeDocumentConverter;
037import org.artofsolving.jodconverter.StandardConversionTask;
038import org.artofsolving.jodconverter.document.DocumentFamily;
039import org.artofsolving.jodconverter.document.DocumentFormat;
040
041import org.nuxeo.common.utils.FileUtils;
042import org.nuxeo.ecm.core.api.Blob;
043import org.nuxeo.ecm.core.api.Blobs;
044import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
045import org.nuxeo.ecm.core.convert.api.ConversionException;
046import org.nuxeo.ecm.core.convert.api.ConverterCheckResult;
047import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
048import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
049import org.nuxeo.ecm.core.convert.extension.ExternalConverter;
050import org.nuxeo.ecm.platform.convert.ooomanager.OOoManagerService;
051import org.nuxeo.ecm.platform.mimetype.interfaces.MimetypeRegistry;
052import org.nuxeo.runtime.api.Framework;
053
054/**
055 * Converter based on JOD which uses an external OpenOffice process to do actual conversions.
056 */
057public class JODBasedConverter implements ExternalConverter {
058
059    protected static final String TMP_PATH_PARAMETER = "TmpDirectory";
060
061    private static final Log log = LogFactory.getLog(JODBasedConverter.class);
062
063    /**
064     * Boolean conversion parameter for PDF/A-1.
065     *
066     * @since 5.6
067     */
068    public static final String PDFA1_PARAM = "PDF/A-1";
069
070    /**
071     * Boolean parameter to force update of the document TOC
072     *
073     * @since 5.6
074     */
075    public static final String UPDATE_INDEX_PARAM = StandardConversionTask.UPDATE_DOCUMENT_INDEX;
076
077    protected static final Map<DocumentFamily, String> PDF_FILTER_NAMES = new HashMap<DocumentFamily, String>();
078    {
079        PDF_FILTER_NAMES.put(DocumentFamily.TEXT, "writer_pdf_Export");
080        PDF_FILTER_NAMES.put(DocumentFamily.SPREADSHEET, "calc_pdf_Export");
081        PDF_FILTER_NAMES.put(DocumentFamily.PRESENTATION, "impress_pdf_Export");
082        PDF_FILTER_NAMES.put(DocumentFamily.DRAWING, "draw_pdf_Export");
083    }
084
085    protected ConverterDescriptor descriptor;
086
087    protected String getDestinationMimeType() {
088        return descriptor.getDestinationMimeType();
089    }
090
091    /**
092     * Returns the destination format for the given plugin.
093     * <p>
094     * It takes the actual destination mimetype from the plugin configuration.
095     *
096     * @param sourceFormat the source format
097     * @param pdfa1 true if PDF/A-1 is required
098     */
099    protected DocumentFormat getDestinationFormat(OfficeDocumentConverter documentConverter,
100            DocumentFormat sourceFormat, boolean pdfa1) {
101        String mimeType = getDestinationMimeType();
102        DocumentFormat destinationFormat = documentConverter.getFormatRegistry().getFormatByMediaType(mimeType);
103        if ("application/pdf".equals(mimeType)) {
104            destinationFormat = extendPDFFormat(sourceFormat, destinationFormat, pdfa1);
105        }
106        return destinationFormat;
107    }
108
109    protected DocumentFormat extendPDFFormat(DocumentFormat sourceFormat, DocumentFormat defaultFormat, boolean pdfa1) {
110        DocumentFamily sourceFamily = sourceFormat.getInputFamily();
111        String sourceMediaType = sourceFormat.getMediaType();
112        DocumentFormat pdfFormat = new DocumentFormat(pdfa1 ? "PDF/A-1" : "PDF", "pdf", "application/pdf");
113        Map<DocumentFamily, Map<String, ?>> storePropertiesByFamily = new HashMap<DocumentFamily, Map<String, ?>>();
114        Map<DocumentFamily, Map<String, ?>> defaultStorePropertiesByFamily = defaultFormat.getStorePropertiesByFamily();
115        for (DocumentFamily family : defaultStorePropertiesByFamily.keySet()) {
116            if (family.equals(sourceFamily)) {
117                continue;
118            }
119            storePropertiesByFamily.put(family, defaultStorePropertiesByFamily.get(family));
120        }
121        storePropertiesByFamily.put(sourceFamily,
122                extendPDFStoreProperties(sourceMediaType, pdfa1, defaultStorePropertiesByFamily.get(sourceFamily)));
123        pdfFormat.setStorePropertiesByFamily(storePropertiesByFamily);
124        return pdfFormat;
125    }
126
127    protected Map<String, Object> extendPDFStoreProperties(String mediatype, boolean pdfa1,
128            Map<String, ?> originalProperties) {
129        Map<String, Object> extendedProperties = new HashMap<String, Object>();
130        for (Map.Entry<String, ?> entry : originalProperties.entrySet()) {
131            extendedProperties.put(entry.getKey(), entry.getValue());
132        }
133        if ("text/html".equals(mediatype)) {
134            extendedProperties.put("FilterName", "writer_web_pdf_Export");
135        }
136        if (pdfa1) {
137            Map<String, Object> filterData = new HashMap<String, Object>();
138            filterData.put("SelectPdfVersion", Integer.valueOf(1)); // PDF/A-1
139            filterData.put("UseTaggedPDF", Boolean.TRUE); // per spec
140            extendedProperties.put("FilterData", filterData);
141        }
142        return extendedProperties;
143    }
144
145    /**
146     * Returns the format for the file passed as a parameter.
147     * <p>
148     * We will ask the mimetype registry service to sniff its mimetype.
149     *
150     * @return DocumentFormat for the given file
151     */
152    private static DocumentFormat getSourceFormat(OfficeDocumentConverter documentConverter, File file) {
153        MimetypeRegistry mimetypeRegistry = Framework.getService(MimetypeRegistry.class);
154        String mimetypeStr = mimetypeRegistry.getMimetypeFromFile(file);
155        DocumentFormat format = documentConverter.getFormatRegistry().getFormatByMediaType(mimetypeStr);
156        return format;
157    }
158
159    /**
160     * Returns the DocumentFormat for the given mimetype.
161     *
162     * @return DocumentFormat for the given mimetype
163     */
164    private static DocumentFormat getSourceFormat(OfficeDocumentConverter documentConverter, String mimetype) {
165        return documentConverter.getFormatRegistry().getFormatByMediaType(mimetype);
166    }
167
168    @Override
169    protected void finalize() throws Throwable {
170        super.finalize();
171    }
172
173    @Override
174    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
175        blobHolder = new UTF8CharsetConverter().convert(blobHolder, parameters);
176        Blob inputBlob = blobHolder.getBlob();
177        String blobPath = blobHolder.getFilePath();
178        if (inputBlob == null) {
179            return null;
180        }
181
182        OfficeDocumentConverter documentConverter = newDocumentConverter();
183        // This plugin do deal only with one input source.
184        String sourceMimetype = inputBlob.getMimeType();
185
186        boolean pdfa1 = parameters != null && Boolean.TRUE.equals(parameters.get(PDFA1_PARAM));
187
188        File sourceFile = null;
189        File outFile = null;
190        File[] files = null;
191        try {
192
193            // If the input blob has the HTML mime type, make sure the
194            // charset meta is present, add it if not
195            if ("text/html".equals(sourceMimetype)) {
196                inputBlob = checkCharsetMeta(inputBlob);
197            }
198
199            // Get original file extension
200            String ext = inputBlob.getFilename();
201            int dotPosition = ext.lastIndexOf('.');
202            if (dotPosition == -1) {
203                ext = ".bin";
204            } else {
205                ext = ext.substring(dotPosition);
206            }
207            // Copy in a file to be able to read it several time
208            sourceFile = File.createTempFile("NXJOOoConverterDocumentIn", ext);
209            InputStream stream = inputBlob.getStream();
210            FileUtils.copyToFile(stream, sourceFile);
211            stream.close();
212
213            DocumentFormat sourceFormat = null;
214            if (sourceMimetype != null) {
215                // Try to fetch it from the registry.
216                sourceFormat = getSourceFormat(documentConverter, sourceMimetype);
217            }
218            // If not found in the registry or not given as a parameter.
219            // Try to sniff ! What does that smell ? :)
220            if (sourceFormat == null) {
221                sourceFormat = getSourceFormat(documentConverter, sourceFile);
222            }
223
224            // From plugin settings because we know the destination
225            // mimetype.
226            DocumentFormat destinationFormat = getDestinationFormat(documentConverter, sourceFormat, pdfa1);
227
228            // allow HTML2PDF filtering
229
230            List<Blob> blobs = new ArrayList<Blob>();
231
232            if (descriptor.getDestinationMimeType().equals("text/html")) {
233                String tmpDirPath = getTmpDirectory();
234                File myTmpDir = new File(tmpDirPath + "/JODConv_" + System.currentTimeMillis());
235                boolean created = myTmpDir.mkdir();
236                if (!created) {
237                    throw new IOException("Unable to create temp dir");
238                }
239
240                outFile = new File(myTmpDir.getAbsolutePath() + "/" + "NXJOOoConverterDocumentOut."
241                        + destinationFormat.getExtension());
242
243                created = outFile.createNewFile();
244                if (!created) {
245                    throw new IOException("Unable to create temp file");
246                }
247
248                log.debug("Input File = " + outFile.getAbsolutePath());
249                // Perform the actual conversion.
250                documentConverter.convert(sourceFile, outFile, destinationFormat);
251
252                files = myTmpDir.listFiles();
253                for (File file : files) {
254                    // copy the files to a new tmp location, as we'll delete them
255                    Blob blob;
256                    try (FileInputStream in = new FileInputStream(file)) {
257                        blob = Blobs.createBlob(in);
258                    }
259                    blob.setFilename(file.getName());
260                    blobs.add(blob);
261                    // add a blob for the index
262                    if (file.getName().equals(outFile.getName())) {
263                        Blob indexBlob;
264                        try (FileInputStream in = new FileInputStream(file)) {
265                            indexBlob = Blobs.createBlob(in);
266                        }
267                        indexBlob.setFilename("index.html");
268                        blobs.add(0, indexBlob);
269                    }
270                }
271
272            } else {
273                outFile = File.createTempFile("NXJOOoConverterDocumentOut", '.' + destinationFormat.getExtension());
274
275                // Perform the actual conversion.
276                documentConverter.convert(sourceFile, outFile, destinationFormat, parameters);
277
278                Blob blob;
279                try (FileInputStream in = new FileInputStream(outFile)) {
280                    blob = Blobs.createBlob(in, getDestinationMimeType());
281                }
282                blobs.add(blob);
283            }
284            return new SimpleCachableBlobHolder(blobs);
285        } catch (IOException e) {
286            String msg = String.format("An error occurred trying to convert file %s to from %s to %s", blobPath,
287                    sourceMimetype, getDestinationMimeType());
288            throw new ConversionException(msg, e);
289        } finally {
290            if (sourceFile != null) {
291                sourceFile.delete();
292            }
293            if (outFile != null) {
294                outFile.delete();
295            }
296
297            if (files != null) {
298                for (File file : files) {
299                    if (file.exists()) {
300                        file.delete();
301                    }
302                }
303            }
304        }
305
306    }
307
308    protected OfficeDocumentConverter newDocumentConverter() throws ConversionException {
309        OOoManagerService oooManagerService = Framework.getService(OOoManagerService.class);
310        OfficeDocumentConverter documentConverter = oooManagerService.getDocumentConverter();
311        if (documentConverter == null) {
312            throw new ConversionException("Could not connect to the remote OpenOffice server");
313        }
314        return documentConverter;
315    }
316
317    @Override
318    public void init(ConverterDescriptor descriptor) {
319        this.descriptor = descriptor;
320    }
321
322    @Override
323    public ConverterCheckResult isConverterAvailable() {
324        ConverterCheckResult result = new ConverterCheckResult();
325        OOoManagerService oooManagerService = Framework.getService(OOoManagerService.class);
326        if (!oooManagerService.isOOoManagerStarted()) {
327            result.setAvailable(false);
328        }
329        return result;
330    }
331
332    protected String getTmpDirectory() {
333        String tmp = null;
334        Map<String, String> parameters = descriptor.getParameters();
335        if (parameters != null && parameters.containsKey(TMP_PATH_PARAMETER)) {
336            tmp = parameters.get(TMP_PATH_PARAMETER);
337        }
338        if (tmp == null) {
339            tmp = System.getProperty("java.io.tmpdir");
340        }
341        return tmp;
342    }
343
344    /**
345     * Checks if the {@code inputBlob} string contains a {@code charset} meta tag. If not, add it.
346     *
347     * @param inputBlob the input blob
348     * @throws IOException Signals that an I/O exception has occurred.
349     */
350    protected Blob checkCharsetMeta(Blob inputBlob) throws IOException {
351
352        String charset = inputBlob.getEncoding();
353        if (!StringUtils.isEmpty(charset)) {
354            Pattern charsetMetaPattern = Pattern.compile(String.format("content=\"text/html;\\s*charset=%s\"", charset));
355            Matcher charsetMetaMatcher = charsetMetaPattern.matcher(inputBlob.getString());
356            if (!charsetMetaMatcher.find()) {
357                String charsetMetaTag = String.format(
358                        "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">", charset);
359                StringBuilder sb = new StringBuilder(charsetMetaTag);
360                sb.append(new String(inputBlob.getByteArray(), charset));
361                Blob blobWithCharsetMetaTag = Blobs.createBlob(sb.toString(), "text/html", charset,
362                        inputBlob.getFilename());
363                return blobWithCharsetMetaTag;
364            }
365        }
366        return inputBlob;
367    }
368}