/*
 * (C) Copyright 2013 Nuxeo SA (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl-2.1.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Stephane Lacoin
 */

package org.nuxeo.ecm.platform.convert.plugins;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Map;

import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang.StringUtils;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Converter that re-encodes {@code text/*} blobs as UTF-8.
 * <p>
 * Blobs whose MIME type is missing or does not start with {@code text/}, and blobs already declared as
 * {@code UTF-8}, are returned untouched. When a text blob declares no encoding, the source charset is guessed
 * from its content with ICU's {@link CharsetDetector}.
 */
public class UTF8CharsetConverter implements Converter {

    private static final String TEXT_PREFIX = "text/";

    private static final String UTF_8 = "UTF-8";

    /**
     * Does nothing: this converter reads no configuration from its descriptor.
     *
     * @param descriptor the converter descriptor (ignored)
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
    }

    /**
     * Transcodes the blob held by {@code blobHolder} and wraps the result in a new {@link SimpleBlobHolder}.
     *
     * @param blobHolder the holder providing the blob to transcode
     * @param parameters conversion parameters (not used by this converter)
     * @return a holder containing the resulting blob
     * @throws ConversionException if reading, charset detection or transcoding fails; the original failure is
     *             attached as the cause and the message names the holder's file path
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
        Blob originalBlob = blobHolder.getBlob();
        String path = blobHolder.getFilePath();
        Blob transcodedBlob;
        try {
            transcodedBlob = convert(originalBlob);
        } catch (IOException | ConversionException e) {
            throw new ConversionException("Cannot transcode " + path + " to UTF-8", e);
        }
        return new SimpleBlobHolder(transcodedBlob);
    }

    /**
     * Produces a UTF-8 version of the given blob.
     * <p>
     * The same instance is returned when the blob has no MIME type, a MIME type that does not start with
     * {@code text/}, or a declared encoding equal to {@code UTF-8}. Otherwise a new blob is created carrying the
     * original MIME type and filename, with its encoding set to {@code UTF-8}.
     *
     * @param blob the blob to transcode
     * @return the original blob when no work is needed, otherwise a new UTF-8 blob
     * @throws IOException if the blob content cannot be read, or the source charset name is not supported
     * @throws ConversionException if the blob declares no encoding and none can be detected
     */
    protected Blob convert(Blob blob) throws IOException, ConversionException {
        String mimetype = blob.getMimeType();
        if (mimetype == null || !mimetype.startsWith(TEXT_PREFIX)) {
            return blob;
        }
        String encoding = blob.getEncoding();
        if (UTF_8.equals(encoding)) {
            return blob;
        }
        if (StringUtils.isEmpty(encoding)) {
            // no declared encoding: sniff it from the content, using a stream dedicated to detection
            try (InputStream in = blob.getStream()) {
                encoding = detectEncoding(in);
            }
        }
        Blob newBlob;
        if (UTF_8.equals(encoding)) {
            // had no encoding previously, detected as UTF-8:
            // the bytes need no transcoding, so copy them as-is into a new blob
            try (InputStream in = blob.getStream()) {
                newBlob = Blobs.createBlob(in);
            }
        } else {
            // decode bytes as chars in the source charset then encode chars as bytes in UTF-8.
            // Each layer is its own resource so that the raw stream is still closed when a wrapper
            // constructor fails (e.g. InputStreamReader rejecting an unsupported charset name).
            try (InputStream raw = blob.getStream();
                    InputStreamReader reader = new InputStreamReader(raw, encoding);
                    InputStream in = new ReaderInputStream(reader, UTF_8)) {
                newBlob = Blobs.createBlob(in);
            }
        }
        newBlob.setMimeType(mimetype);
        newBlob.setEncoding(UTF_8);
        newBlob.setFilename(blob.getFilename());
        return newBlob;
    }

    /**
     * Guesses the charset of the given stream with ICU's {@link CharsetDetector}.
     * <p>
     * The stream is not closed by this method; closing it is left to the caller.
     *
     * @param in the content to analyze; wrapped in a {@link BufferedInputStream} when it does not support mark
     * @return the name of the charset reported by the detector
     * @throws IOException if the stream cannot be read
     * @throws ConversionException if the detector returns no match
     */
    protected String detectEncoding(InputStream in) throws IOException, ConversionException {
        // detector.setText requires mark
        InputStream markable = in.markSupported() ? in : new BufferedInputStream(in);
        CharsetDetector detector = new CharsetDetector();
        detector.setText(markable);
        CharsetMatch charsetMatch = detector.detect();
        if (charsetMatch == null) {
            throw new ConversionException("Cannot detect source charset.");
        }
        return charsetMatch.getName();
    }

}