001/* 002 * (C) Copyright 2013 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Stephane Lacoin 018 */ 019 020package org.nuxeo.ecm.platform.convert.plugins; 021 022import java.io.BufferedInputStream; 023import java.io.IOException; 024import java.io.InputStream; 025import java.io.InputStreamReader; 026import java.io.Serializable; 027import java.util.Map; 028 029import org.apache.commons.io.input.ReaderInputStream; 030import org.apache.commons.lang.StringUtils; 031import org.nuxeo.ecm.core.api.Blob; 032import org.nuxeo.ecm.core.api.Blobs; 033import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 034import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder; 035import org.nuxeo.ecm.core.convert.api.ConversionException; 036import org.nuxeo.ecm.core.convert.extension.Converter; 037import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor; 038 039import com.ibm.icu.text.CharsetDetector; 040import com.ibm.icu.text.CharsetMatch; 041 042public class UTF8CharsetConverter implements Converter { 043 044 private static final String TEXT_PREFIX = "text/"; 045 046 private static final String UTF_8 = "UTF-8"; 047 048 @Override 049 public void init(ConverterDescriptor descriptor) { 050 } 051 052 @Override 053 public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { 054 Blob originalBlob = blobHolder.getBlob(); 055 String path = blobHolder.getFilePath(); 056 Blob transcodedBlob; 057 try { 058 transcodedBlob = convert(originalBlob); 059 } catch (IOException | ConversionException e) { 060 throw new ConversionException("Cannot transcode " + path + " to UTF-8", e); 061 } 062 return new SimpleBlobHolder(transcodedBlob); 063 } 064 065 protected Blob convert(Blob blob) throws IOException, ConversionException { 066 String mimetype = blob.getMimeType(); 067 if (mimetype == null || !mimetype.startsWith(TEXT_PREFIX)) { 068 return blob; 069 } 070 String encoding = blob.getEncoding(); 071 if (UTF_8.equals(encoding)) { 072 return blob; 073 } 074 if (StringUtils.isEmpty(encoding)) { 075 try (InputStream in = blob.getStream()) { 076 encoding = detectEncoding(in); 077 } 078 } 079 Blob newBlob; 080 if (UTF_8.equals(encoding)) { 081 // had no encoding previously, detected as UTF-8 082 // just reuse the same blob 083 try (InputStream in = blob.getStream()) { 084 newBlob = Blobs.createBlob(in); 085 } 086 } else { 087 // decode bytes as chars in the detected charset then encode chars as bytes in UTF-8 088 try (InputStream in = new ReaderInputStream(new InputStreamReader(blob.getStream(), encoding), UTF_8)) { 089 newBlob = Blobs.createBlob(in); 090 } 091 } 092 newBlob.setMimeType(mimetype); 093 newBlob.setEncoding(UTF_8); 094 newBlob.setFilename(blob.getFilename()); 095 return newBlob; 096 } 097 098 protected String detectEncoding(InputStream in) throws IOException, ConversionException { 099 if (!in.markSupported()) { 100 // detector.setText requires mark 101 in = new BufferedInputStream(in); 102 } 103 CharsetDetector detector = new CharsetDetector(); 104 detector.setText(in); 105 CharsetMatch charsetMatch = detector.detect(); 106 if (charsetMatch == null) { 107 throw new ConversionException("Cannot detect source charset."); 108 } 109 return charsetMatch.getName(); 110 } 111 112}