/*
 * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Nuxeo
 *     Florent Guillaume
 *     Thierry Delprat
 */
package org.nuxeo.ecm.platform.convert.plugins;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.artofsolving.jodconverter.OfficeDocumentConverter;
import org.artofsolving.jodconverter.StandardConversionTask;
import org.artofsolving.jodconverter.document.DocumentFamily;
import org.artofsolving.jodconverter.document.DocumentFormat;

import org.nuxeo.common.utils.FileUtils;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConverterCheckResult;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
import org.nuxeo.ecm.core.convert.extension.ExternalConverter;
import org.nuxeo.ecm.platform.convert.ooomanager.OOoManagerService;
import org.nuxeo.ecm.platform.mimetype.interfaces.MimetypeRegistry;
import org.nuxeo.runtime.api.Framework;

/**
 * Converter based on JOD which uses an external OpenOffice process to do actual conversions.
 */
public class JODBasedConverter implements ExternalConverter {

    /** Name of the converter descriptor parameter holding the temporary directory path. */
    protected static final String TMP_PATH_PARAMETER = "TmpDirectory";

    private static final Log log = LogFactory.getLog(JODBasedConverter.class);

    /**
     * Boolean conversion parameter for PDF/A-1.
     *
     * @since 5.6
     */
    public static final String PDFA1_PARAM = "PDF/A-1";

    /**
     * Boolean parameter to force update of the document TOC
     *
     * @since 5.6
     */
    public static final String UPDATE_INDEX_PARAM = StandardConversionTask.UPDATE_DOCUMENT_INDEX;

    /** PDF export filter name for each document family. */
    protected static final Map<DocumentFamily, String> PDF_FILTER_NAMES = new HashMap<DocumentFamily, String>();

    // Static initializer: the map is static, so it must be filled once at class-load time, not each time an
    // instance is created.
    static {
        PDF_FILTER_NAMES.put(DocumentFamily.TEXT, "writer_pdf_Export");
        PDF_FILTER_NAMES.put(DocumentFamily.SPREADSHEET, "calc_pdf_Export");
        PDF_FILTER_NAMES.put(DocumentFamily.PRESENTATION, "impress_pdf_Export");
        PDF_FILTER_NAMES.put(DocumentFamily.DRAWING, "draw_pdf_Export");
    }

    /** Descriptor this converter was initialized with, see {@link #init(ConverterDescriptor)}. */
    protected ConverterDescriptor descriptor;

    /**
     * Returns the destination mime type declared by the converter descriptor.
     */
    protected String getDestinationMimeType() {
        return descriptor.getDestinationMimeType();
    }

    /**
     * Returns the destination format for the given plugin.
     * <p>
     * It takes the actual destination mimetype from the plugin configuration. When the destination is PDF, the
     * registry format is replaced by the one built by
     * {@link #extendPDFFormat(DocumentFormat, DocumentFormat, boolean)}.
     *
     * @param documentConverter the converter whose format registry is queried
     * @param sourceFormat the source format
     * @param pdfa1 true if PDF/A-1 is required
     * @return the destination format, as returned by the registry for non-PDF destinations
     */
    protected DocumentFormat getDestinationFormat(OfficeDocumentConverter documentConverter,
            DocumentFormat sourceFormat, boolean pdfa1) {
        String mimeType = getDestinationMimeType();
        DocumentFormat destinationFormat = documentConverter.getFormatRegistry().getFormatByMediaType(mimeType);
        if ("application/pdf".equals(mimeType)) {
            destinationFormat = extendPDFFormat(sourceFormat, destinationFormat, pdfa1);
        }
        return destinationFormat;
    }

    /**
     * Builds a PDF destination format whose store properties for the source document family are extended by
     * {@link #extendPDFStoreProperties(String, boolean, Map)}; the properties of the other families are copied
     * unchanged from {@code defaultFormat}.
     *
     * @param sourceFormat the format of the document being converted
     * @param defaultFormat the PDF format found in the registry; a {@code null} format or {@code null} store
     *            properties are treated as empty
     * @param pdfa1 true if PDF/A-1 is required
     * @return a new PDF format
     */
    protected DocumentFormat extendPDFFormat(DocumentFormat sourceFormat, DocumentFormat defaultFormat, boolean pdfa1) {
        DocumentFamily sourceFamily = sourceFormat.getInputFamily();
        String sourceMediaType = sourceFormat.getMediaType();
        DocumentFormat pdfFormat = new DocumentFormat(pdfa1 ? "PDF/A-1" : "PDF", "pdf", "application/pdf");

        Map<DocumentFamily, Map<String, ?>> defaultStorePropertiesByFamily = defaultFormat == null ? null
                : defaultFormat.getStorePropertiesByFamily();
        Map<DocumentFamily, Map<String, ?>> storePropertiesByFamily = new HashMap<DocumentFamily, Map<String, ?>>();
        Map<String, ?> sourceFamilyProperties = null;
        if (defaultStorePropertiesByFamily != null) {
            storePropertiesByFamily.putAll(defaultStorePropertiesByFamily);
            sourceFamilyProperties = defaultStorePropertiesByFamily.get(sourceFamily);
        }
        // Overrides the entry copied above for the source family.
        storePropertiesByFamily.put(sourceFamily,
                extendPDFStoreProperties(sourceMediaType, pdfa1, sourceFamilyProperties));
        pdfFormat.setStorePropertiesByFamily(storePropertiesByFamily);
        return pdfFormat;
    }

    /**
     * Returns a copy of {@code originalProperties} with the additional PDF export settings.
     * <p>
     * An HTML source switches the filter to {@code writer_web_pdf_Export}; PDF/A-1 adds a {@code FilterData} map.
     *
     * @param mediatype the media type of the source document
     * @param pdfa1 true if PDF/A-1 is required
     * @param originalProperties the properties to start from, {@code null} being treated as empty
     * @return a new, independent map
     */
    protected Map<String, Object> extendPDFStoreProperties(String mediatype, boolean pdfa1,
            Map<String, ?> originalProperties) {
        Map<String, Object> extendedProperties = new HashMap<String, Object>();
        if (originalProperties != null) {
            extendedProperties.putAll(originalProperties);
        }
        if ("text/html".equals(mediatype)) {
            extendedProperties.put("FilterName", "writer_web_pdf_Export");
        }
        if (pdfa1) {
            Map<String, Object> filterData = new HashMap<String, Object>();
            filterData.put("SelectPdfVersion", Integer.valueOf(1)); // PDF/A-1
            filterData.put("UseTaggedPDF", Boolean.TRUE); // per spec
            extendedProperties.put("FilterData", filterData);
        }
        return extendedProperties;
    }

    /**
     * Returns the format for the file passed as a parameter.
     * <p>
     * We will ask the mimetype registry service to sniff its mimetype.
     *
     * @return DocumentFormat for the given file
     */
    private static DocumentFormat getSourceFormat(OfficeDocumentConverter documentConverter, File file) {
        MimetypeRegistry mimetypeRegistry = Framework.getService(MimetypeRegistry.class);
        String mimetypeStr = mimetypeRegistry.getMimetypeFromFile(file);
        return documentConverter.getFormatRegistry().getFormatByMediaType(mimetypeStr);
    }

    /**
     * Returns the DocumentFormat for the given mimetype.
     *
     * @return DocumentFormat for the given mimetype
     */
    private static DocumentFormat getSourceFormat(OfficeDocumentConverter documentConverter, String mimetype) {
        return documentConverter.getFormatRegistry().getFormatByMediaType(mimetype);
    }

    /**
     * Returns the suffix to use for the temporary copy of the input: the part of {@code filename} starting at its
     * last dot, or {@code .bin} when there is no filename or no dot.
     */
    private static String getExtension(String filename) {
        if (filename == null) {
            return ".bin";
        }
        int dotPosition = filename.lastIndexOf('.');
        return dotPosition == -1 ? ".bin" : filename.substring(dotPosition);
    }

    /**
     * Best-effort recursive deletion of a temporary file or directory; failures are only logged.
     */
    private static void deleteTree(File file) {
        if (file == null) {
            return;
        }
        File[] children = file.listFiles(); // null for a regular or missing file
        if (children != null) {
            for (File child : children) {
                deleteTree(child);
            }
        }
        if (file.exists() && !file.delete()) {
            log.debug("Unable to delete temporary file " + file.getAbsolutePath());
        }
    }

    /**
     * Converts the blob held by {@code blobHolder} to the destination mime type of the descriptor.
     * <p>
     * The input is first passed through {@code UTF8CharsetConverter}, then copied to a temporary file which is
     * handed to the office document converter. For a {@code text/html} destination, every file produced in the
     * output directory becomes a blob and a copy of the main output named {@code index.html} is put first.
     * Otherwise the result holds the single converted blob. All temporary files are removed before returning.
     *
     * @param blobHolder the holder of the blob to convert
     * @param parameters the conversion parameters, may be {@code null}; see {@link #PDFA1_PARAM}
     * @return the converted blobs, or {@code null} if there is no input blob
     * @throws ConversionException if the source or destination format is unknown or an I/O error occurs
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
        blobHolder = new UTF8CharsetConverter().convert(blobHolder, parameters);
        Blob inputBlob = blobHolder.getBlob();
        String blobPath = blobHolder.getFilePath();
        if (inputBlob == null) {
            return null;
        }

        OfficeDocumentConverter documentConverter = newDocumentConverter();
        // This plugin deals with only one input source.
        String sourceMimetype = inputBlob.getMimeType();

        boolean pdfa1 = parameters != null && Boolean.TRUE.equals(parameters.get(PDFA1_PARAM));

        File sourceFile = null;
        File outFile = null;
        File outDir = null;
        try {
            // If the input blob has the HTML mime type, make sure the
            // charset meta is present, add it if not
            if ("text/html".equals(sourceMimetype)) {
                inputBlob = checkCharsetMeta(inputBlob);
            }

            // Copy to a file, keeping the original extension, to be able to read it several times
            sourceFile = File.createTempFile("NXJOOoConverterDocumentIn", getExtension(inputBlob.getFilename()));
            try (InputStream stream = inputBlob.getStream()) {
                FileUtils.copyToFile(stream, sourceFile);
            }

            DocumentFormat sourceFormat = null;
            if (sourceMimetype != null) {
                // Try to fetch it from the registry.
                sourceFormat = getSourceFormat(documentConverter, sourceMimetype);
            }
            // If not found in the registry or not given as a parameter, try to sniff it.
            if (sourceFormat == null) {
                sourceFormat = getSourceFormat(documentConverter, sourceFile);
            }
            // Only the PDF destination needs the source format (to pick the export filter).
            if (sourceFormat == null && "application/pdf".equals(getDestinationMimeType())) {
                throw new ConversionException(String.format(
                        "Unable to determine the source format of file %s (mime type %s)", blobPath, sourceMimetype));
            }

            // From plugin settings because we know the destination mimetype.
            DocumentFormat destinationFormat = getDestinationFormat(documentConverter, sourceFormat, pdfa1);
            if (destinationFormat == null) {
                throw new ConversionException(String.format("Unsupported destination mime type %s",
                        getDestinationMimeType()));
            }

            List<Blob> blobs = new ArrayList<Blob>();

            if ("text/html".equals(getDestinationMimeType())) {
                // Dedicated directory with a unique name: every file found in it after the conversion is
                // collected below.
                outDir = Files.createTempDirectory(new File(getTmpDirectory()).toPath(), "JODConv_").toFile();
                outFile = new File(outDir, "NXJOOoConverterDocumentOut." + destinationFormat.getExtension());
                if (!outFile.createNewFile()) {
                    throw new IOException("Unable to create temp file");
                }

                log.debug("Output File = " + outFile.getAbsolutePath());
                // Perform the actual conversion.
                documentConverter.convert(sourceFile, outFile, destinationFormat);

                File[] files = outDir.listFiles();
                if (files == null) {
                    throw new IOException("Unable to list converted files in " + outDir.getAbsolutePath());
                }
                for (File file : files) {
                    // copy the files to blobs, as we'll delete them
                    Blob blob;
                    try (FileInputStream in = new FileInputStream(file)) {
                        blob = Blobs.createBlob(in);
                    }
                    blob.setFilename(file.getName());
                    blobs.add(blob);
                    // add a blob for the index
                    if (file.getName().equals(outFile.getName())) {
                        Blob indexBlob;
                        try (FileInputStream in = new FileInputStream(file)) {
                            indexBlob = Blobs.createBlob(in);
                        }
                        indexBlob.setFilename("index.html");
                        blobs.add(0, indexBlob);
                    }
                }
            } else {
                outFile = File.createTempFile("NXJOOoConverterDocumentOut", '.' + destinationFormat.getExtension());

                // Perform the actual conversion.
                documentConverter.convert(sourceFile, outFile, destinationFormat, parameters);

                Blob blob;
                try (FileInputStream in = new FileInputStream(outFile)) {
                    blob = Blobs.createBlob(in, getDestinationMimeType());
                }
                blobs.add(blob);
            }
            return new SimpleCachableBlobHolder(blobs);
        } catch (IOException e) {
            String msg = String.format("An error occurred trying to convert file %s from %s to %s", blobPath,
                    sourceMimetype, getDestinationMimeType());
            throw new ConversionException(msg, e);
        } finally {
            deleteTree(sourceFile);
            deleteTree(outFile);
            // Also removes the directory itself and anything the conversion wrote next to the main file.
            deleteTree(outDir);
        }
    }

    /**
     * Fetches the document converter from the {@link OOoManagerService}.
     *
     * @throws ConversionException if the service or its document converter is not available
     */
    protected OfficeDocumentConverter newDocumentConverter() throws ConversionException {
        OOoManagerService oooManagerService = Framework.getService(OOoManagerService.class);
        OfficeDocumentConverter documentConverter = oooManagerService == null ? null
                : oooManagerService.getDocumentConverter();
        if (documentConverter == null) {
            throw new ConversionException("Could not connect to the remote OpenOffice server");
        }
        return documentConverter;
    }

    /**
     * Stores the descriptor, from which the destination mime type and the parameters are read.
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
        this.descriptor = descriptor;
    }

    /**
     * Reports the converter as unavailable when the {@link OOoManagerService} is missing or not started.
     */
    @Override
    public ConverterCheckResult isConverterAvailable() {
        ConverterCheckResult result = new ConverterCheckResult();
        OOoManagerService oooManagerService = Framework.getService(OOoManagerService.class);
        if (oooManagerService == null || !oooManagerService.isOOoManagerStarted()) {
            result.setAvailable(false);
        }
        return result;
    }

    /**
     * Returns the directory to use for temporary files: the {@link #TMP_PATH_PARAMETER} descriptor parameter if
     * set, the {@code java.io.tmpdir} system property otherwise.
     */
    protected String getTmpDirectory() {
        Map<String, String> parameters = descriptor.getParameters();
        String tmp = parameters == null ? null : parameters.get(TMP_PATH_PARAMETER);
        return tmp != null ? tmp : System.getProperty("java.io.tmpdir");
    }

    /**
     * Checks if the {@code inputBlob} string contains a {@code charset} meta tag. If not, add it.
     * <p>
     * Nothing is done when the blob has no encoding. The charset name is matched literally and without regard to
     * case.
     *
     * @param inputBlob the input blob
     * @return {@code inputBlob} itself if no change is needed, else a new HTML blob starting with the meta tag
     * @throws IOException Signals that an I/O exception has occurred.
     */
    protected Blob checkCharsetMeta(Blob inputBlob) throws IOException {
        String charset = inputBlob.getEncoding();
        if (StringUtils.isEmpty(charset)) {
            return inputBlob;
        }
        // Quote the charset so that characters such as '.' or '+' are not interpreted as regex syntax.
        Pattern charsetMetaPattern = Pattern.compile(
                "content=\"text/html;\\s*charset=" + Pattern.quote(charset) + "\"", Pattern.CASE_INSENSITIVE);
        Matcher charsetMetaMatcher = charsetMetaPattern.matcher(inputBlob.getString());
        if (charsetMetaMatcher.find()) {
            return inputBlob;
        }
        String charsetMetaTag = String.format("<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">",
                charset);
        StringBuilder sb = new StringBuilder(charsetMetaTag);
        sb.append(new String(inputBlob.getByteArray(), charset));
        return Blobs.createBlob(sb.toString(), "text/html", charset, inputBlob.getFilename());
    }
}