/*
 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Nuxeo
 *     Florent Guillaume
 *     Thierry Delprat
 */
package org.nuxeo.ecm.platform.convert.plugins;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.artofsolving.jodconverter.OfficeDocumentConverter;
import org.artofsolving.jodconverter.StandardConversionTask;
import org.artofsolving.jodconverter.document.DocumentFamily;
import org.artofsolving.jodconverter.document.DocumentFormat;

import org.nuxeo.common.Environment;
import org.nuxeo.common.utils.FileUtils;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConverterCheckResult;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
import org.nuxeo.ecm.core.convert.extension.ExternalConverter;
import org.nuxeo.ecm.platform.convert.ooomanager.OOoManagerService;
import org.nuxeo.ecm.platform.mimetype.interfaces.MimetypeRegistry;
import org.nuxeo.runtime.api.Framework;

/**
 * Converter based on JOD which uses an external OpenOffice process to do actual conversions.
 */
public class JODBasedConverter implements ExternalConverter {

    /** Name of the descriptor parameter that may override the temporary directory. */
    protected static final String TMP_PATH_PARAMETER = "TmpDirectory";

    private static final Log log = LogFactory.getLog(JODBasedConverter.class);

    /** Extension used for the temporary source file when the blob filename has none. */
    private static final String DEFAULT_EXTENSION = ".bin";

    /**
     * Boolean conversion parameter for PDF/A-1.
     *
     * @since 5.6
     */
    public static final String PDFA1_PARAM = "PDF/A-1";

    /**
     * Boolean parameter to force update of the document TOC
     *
     * @since 5.6
     */
    public static final String UPDATE_INDEX_PARAM = StandardConversionTask.UPDATE_DOCUMENT_INDEX;

    /** PDF export filter name for each document family. */
    protected static final Map<DocumentFamily, String> PDF_FILTER_NAMES = new HashMap<>();

    // Static initializer: the map is static, so it must be filled once at class load time rather than each time an
    // instance is constructed.
    static {
        PDF_FILTER_NAMES.put(DocumentFamily.TEXT, "writer_pdf_Export");
        PDF_FILTER_NAMES.put(DocumentFamily.SPREADSHEET, "calc_pdf_Export");
        PDF_FILTER_NAMES.put(DocumentFamily.PRESENTATION, "impress_pdf_Export");
        PDF_FILTER_NAMES.put(DocumentFamily.DRAWING, "draw_pdf_Export");
    }

    /** Descriptor this converter was initialized with, see {@link #init(ConverterDescriptor)}. */
    protected ConverterDescriptor descriptor;

    /**
     * Returns the destination mimetype declared by the converter descriptor.
     */
    protected String getDestinationMimeType() {
        return descriptor.getDestinationMimeType();
    }

    /**
     * Returns the destination format for the given plugin.
     * <p>
     * It takes the actual destination mimetype from the plugin configuration. When the destination is PDF, the format
     * found in the registry is replaced by one built by {@link #extendPDFFormat}.
     *
     * @param documentConverter the converter whose format registry is queried
     * @param sourceFormat the source format
     * @param pdfa1 true if PDF/A-1 is required
     */
    protected DocumentFormat getDestinationFormat(OfficeDocumentConverter documentConverter,
            DocumentFormat sourceFormat, boolean pdfa1) {
        String mimeType = getDestinationMimeType();
        DocumentFormat destinationFormat = documentConverter.getFormatRegistry().getFormatByMediaType(mimeType);
        if ("application/pdf".equals(mimeType)) {
            destinationFormat = extendPDFFormat(sourceFormat, destinationFormat, pdfa1);
        }
        return destinationFormat;
    }

    /**
     * Builds a PDF destination format from the default one, replacing the store properties of the source document
     * family by the ones computed by {@link #extendPDFStoreProperties}. Store properties of the other families are
     * copied unchanged.
     *
     * @param sourceFormat the source format
     * @param defaultFormat the default PDF format
     * @param pdfa1 true if PDF/A-1 is required
     */
    protected DocumentFormat extendPDFFormat(DocumentFormat sourceFormat, DocumentFormat defaultFormat, boolean pdfa1) {
        DocumentFamily sourceFamily = sourceFormat.getInputFamily();
        String sourceMediaType = sourceFormat.getMediaType();
        DocumentFormat pdfFormat = new DocumentFormat(pdfa1 ? "PDF/A-1" : "PDF", "pdf", "application/pdf");
        Map<DocumentFamily, Map<String, ?>> storePropertiesByFamily = new HashMap<>();
        Map<DocumentFamily, Map<String, ?>> defaultStorePropertiesByFamily = defaultFormat.getStorePropertiesByFamily();
        for (Map.Entry<DocumentFamily, Map<String, ?>> entry : defaultStorePropertiesByFamily.entrySet()) {
            if (!entry.getKey().equals(sourceFamily)) {
                storePropertiesByFamily.put(entry.getKey(), entry.getValue());
            }
        }
        storePropertiesByFamily.put(sourceFamily,
                extendPDFStoreProperties(sourceMediaType, pdfa1, defaultStorePropertiesByFamily.get(sourceFamily)));
        pdfFormat.setStorePropertiesByFamily(storePropertiesByFamily);
        return pdfFormat;
    }

    /**
     * Returns a copy of the given store properties, adjusted for the source media type and the PDF/A-1 flag.
     *
     * @param mediatype the source media type; {@code text/html} sets the {@code FilterName} property to
     *            {@code writer_web_pdf_Export}
     * @param pdfa1 true to add a {@code FilterData} property selecting PDF/A-1
     * @param originalProperties the properties to start from, {@code null} is treated as empty
     * @return a new map, the original one is left untouched
     */
    protected Map<String, Object> extendPDFStoreProperties(String mediatype, boolean pdfa1,
            Map<String, ?> originalProperties) {
        Map<String, Object> extendedProperties = new HashMap<>();
        if (originalProperties != null) {
            // no default store properties may exist for the source family
            extendedProperties.putAll(originalProperties);
        }
        if ("text/html".equals(mediatype)) {
            extendedProperties.put("FilterName", "writer_web_pdf_Export");
        }
        if (pdfa1) {
            Map<String, Object> filterData = new HashMap<>();
            filterData.put("SelectPdfVersion", Integer.valueOf(1)); // PDF/A-1
            filterData.put("UseTaggedPDF", Boolean.TRUE); // per spec
            extendedProperties.put("FilterData", filterData);
        }
        return extendedProperties;
    }

    /**
     * Returns the format for the file passed as a parameter.
     * <p>
     * We will ask the mimetype registry service to sniff its mimetype.
     *
     * @return DocumentFormat for the given file
     */
    private static DocumentFormat getSourceFormat(OfficeDocumentConverter documentConverter, File file) {
        MimetypeRegistry mimetypeRegistry = Framework.getService(MimetypeRegistry.class);
        String mimetypeStr = mimetypeRegistry.getMimetypeFromFile(file);
        return documentConverter.getFormatRegistry().getFormatByMediaType(mimetypeStr);
    }

    /**
     * Returns the DocumentFormat for the given mimetype.
     *
     * @return DocumentFormat for the given mimetype
     */
    private static DocumentFormat getSourceFormat(OfficeDocumentConverter documentConverter, String mimetype) {
        return documentConverter.getFormatRegistry().getFormatByMediaType(mimetype);
    }

    /**
     * Returns the extension (leading dot included) of the given filename, or {@code .bin} when the filename is
     * {@code null} or contains no dot.
     */
    private static String getExtension(String filename) {
        if (filename == null) {
            return DEFAULT_EXTENSION;
        }
        int dotPosition = filename.lastIndexOf('.');
        return dotPosition == -1 ? DEFAULT_EXTENSION : filename.substring(dotPosition);
    }

    /**
     * Deletes the given file, or the given directory and everything below it. Does nothing for {@code null} or a
     * missing file; a failed deletion is only logged.
     */
    private static void deleteRecursively(File file) {
        if (file == null || !file.exists()) {
            return;
        }
        File[] children = file.listFiles(); // null for a regular file
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        if (!file.delete()) {
            log.debug("Unable to delete temporary file " + file.getAbsolutePath());
        }
    }

    /**
     * Converts the blob of the given holder to the destination mimetype of the descriptor.
     * <p>
     * The input blob is copied to a temporary file, converted through the office document converter, and the result
     * is returned as new blobs. For a {@code text/html} destination the conversion runs in a dedicated temporary
     * directory and every file produced there is returned, with a copy of the main file named {@code index.html} as
     * first blob. Temporary files are removed before returning.
     *
     * @param blobHolder the holder of the blob to convert
     * @param parameters the conversion parameters, may be {@code null}; {@link #PDFA1_PARAM} is read here
     * @return the holder of the converted blobs, or {@code null} if there is no input blob
     * @throws ConversionException if an I/O error occurs or no document converter is available
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
        blobHolder = new UTF8CharsetConverter().convert(blobHolder, parameters);
        Blob inputBlob = blobHolder.getBlob();
        String blobPath = blobHolder.getFilePath();
        if (inputBlob == null) {
            return null;
        }

        OfficeDocumentConverter documentConverter = newDocumentConverter();
        // This plugin deals with only one input source.
        String sourceMimetype = inputBlob.getMimeType();

        boolean pdfa1 = parameters != null && Boolean.TRUE.equals(parameters.get(PDFA1_PARAM));

        File sourceFile = null;
        File outFile = null;
        // only set once this call has created the directory, so that cleanup never touches a directory it doesn't own
        File tmpDir = null;
        try {

            // If the input blob has the HTML mime type, make sure the
            // charset meta is present, add it if not
            if ("text/html".equals(sourceMimetype)) {
                inputBlob = checkCharsetMeta(inputBlob);
            }

            // Copy in a file, keeping the original extension, to be able to read it several times
            sourceFile = Framework.createTempFile("NXJOOoConverterDocumentIn", getExtension(inputBlob.getFilename()));
            try (InputStream stream = inputBlob.getStream()) {
                FileUtils.copyToFile(stream, sourceFile);
            }

            DocumentFormat sourceFormat = null;
            if (sourceMimetype != null) {
                // Try to fetch it from the registry.
                sourceFormat = getSourceFormat(documentConverter, sourceMimetype);
            }
            // If not found in the registry or no mimetype on the blob, sniff the file.
            if (sourceFormat == null) {
                sourceFormat = getSourceFormat(documentConverter, sourceFile);
            }

            // From plugin settings because we know the destination
            // mimetype.
            DocumentFormat destinationFormat = getDestinationFormat(documentConverter, sourceFormat, pdfa1);

            List<Blob> blobs = new ArrayList<>();

            if ("text/html".equals(descriptor.getDestinationMimeType())) {
                File candidateDir = new File(getTmpDirectory() + "/JODConv_" + System.currentTimeMillis());
                if (!candidateDir.mkdir()) {
                    throw new IOException("Unable to create temp dir");
                }
                tmpDir = candidateDir;

                outFile = new File(tmpDir.getAbsolutePath() + "/" + "NXJOOoConverterDocumentOut."
                        + destinationFormat.getExtension());
                if (!outFile.createNewFile()) {
                    throw new IOException("Unable to create temp file");
                }

                log.debug("Output File = " + outFile.getAbsolutePath());
                // Perform the actual conversion.
                documentConverter.convert(sourceFile, outFile, destinationFormat);

                File[] files = tmpDir.listFiles();
                if (files == null) {
                    throw new IOException("Unable to list files in temp dir " + tmpDir.getAbsolutePath());
                }
                for (File file : files) {
                    // copy the files to new blobs, as we'll delete them
                    Blob blob;
                    try (FileInputStream in = new FileInputStream(file)) {
                        blob = Blobs.createBlob(in);
                    }
                    blob.setFilename(file.getName());
                    blobs.add(blob);
                    // add a blob for the index
                    if (file.getName().equals(outFile.getName())) {
                        Blob indexBlob;
                        try (FileInputStream in = new FileInputStream(file)) {
                            indexBlob = Blobs.createBlob(in);
                        }
                        indexBlob.setFilename("index.html");
                        blobs.add(0, indexBlob);
                    }
                }

            } else {
                outFile = Framework.createTempFile("NXJOOoConverterDocumentOut", '.' + destinationFormat.getExtension());

                // Perform the actual conversion.
                documentConverter.convert(sourceFile, outFile, destinationFormat, parameters);

                Blob blob;
                try (FileInputStream in = new FileInputStream(outFile)) {
                    blob = Blobs.createBlob(in, getDestinationMimeType());
                }
                blobs.add(blob);
            }
            return new SimpleCachableBlobHolder(blobs);
        } catch (IOException e) {
            String msg = String.format("An error occurred trying to convert file %s from %s to %s", blobPath,
                    sourceMimetype, getDestinationMimeType());
            throw new ConversionException(msg, e);
        } finally {
            deleteRecursively(sourceFile);
            deleteRecursively(outFile);
            // removes the directory itself and whatever the conversion left in it, even on failure
            deleteRecursively(tmpDir);
        }

    }

    /**
     * Returns the document converter provided by the {@link OOoManagerService}.
     *
     * @throws ConversionException if the service has no document converter
     */
    protected OfficeDocumentConverter newDocumentConverter() throws ConversionException {
        OOoManagerService oooManagerService = Framework.getService(OOoManagerService.class);
        OfficeDocumentConverter documentConverter = oooManagerService.getDocumentConverter();
        if (documentConverter == null) {
            throw new ConversionException("Could not connect to the remote OpenOffice server");
        }
        return documentConverter;
    }

    @SuppressWarnings("hiding")
    @Override
    public void init(ConverterDescriptor descriptor) {
        this.descriptor = descriptor;
    }

    /**
     * Returns a check result flagged as unavailable when the OOo manager is not started.
     */
    @Override
    public ConverterCheckResult isConverterAvailable() {
        ConverterCheckResult result = new ConverterCheckResult();
        OOoManagerService oooManagerService = Framework.getService(OOoManagerService.class);
        if (!oooManagerService.isOOoManagerStarted()) {
            result.setAvailable(false);
        }
        return result;
    }

    /**
     * Returns the path of the temporary directory: the {@link #TMP_PATH_PARAMETER} descriptor parameter when set,
     * otherwise the temporary directory of the default {@link Environment}.
     */
    protected String getTmpDirectory() {
        String tmp = null;
        Map<String, String> parameters = descriptor.getParameters();
        if (parameters != null && parameters.containsKey(TMP_PATH_PARAMETER)) {
            tmp = parameters.get(TMP_PATH_PARAMETER);
        }
        if (tmp == null) {
            tmp = Environment.getDefault().getTemp().getPath();
        }
        return tmp;
    }

    /**
     * Checks if the {@code inputBlob} string contains a {@code charset} meta tag. If not, add it.
     * <p>
     * Nothing is done when the blob has no encoding. The charset name is matched literally and case-insensitively.
     *
     * @param inputBlob the input blob
     * @return the input blob, or a new blob whose content starts with the charset meta tag
     * @throws IOException Signals that an I/O exception has occurred.
     */
    protected Blob checkCharsetMeta(Blob inputBlob) throws IOException {

        String charset = inputBlob.getEncoding();
        if (StringUtils.isEmpty(charset)) {
            return inputBlob;
        }
        // quote the charset so that it cannot be interpreted as regex syntax; ignore case as in "utf-8" vs "UTF-8"
        Pattern charsetMetaPattern = Pattern.compile(
                "content=\"text/html;\\s*charset=" + Pattern.quote(charset) + "\"", Pattern.CASE_INSENSITIVE);
        Matcher charsetMetaMatcher = charsetMetaPattern.matcher(inputBlob.getString());
        if (charsetMetaMatcher.find()) {
            return inputBlob;
        }
        String charsetMetaTag = String.format("<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">",
                charset);
        StringBuilder sb = new StringBuilder(charsetMetaTag);
        sb.append(new String(inputBlob.getByteArray(), charset));
        return Blobs.createBlob(sb.toString(), "text/html", charset, inputBlob.getFilename());
    }
}