001/* 002 * (C) Copyright 2013-2018 Nuxeo (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Stephane Lacoin 018 */ 019 020package org.nuxeo.ecm.platform.convert.plugins; 021 022import java.io.BufferedInputStream; 023import java.io.IOException; 024import java.io.InputStream; 025import java.io.InputStreamReader; 026import java.io.Serializable; 027import java.util.Map; 028import java.util.Optional; 029 030import org.apache.commons.io.input.ReaderInputStream; 031import org.apache.commons.lang3.StringUtils; 032import org.nuxeo.ecm.core.api.Blob; 033import org.nuxeo.ecm.core.api.Blobs; 034import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 035import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder; 036import org.nuxeo.ecm.core.convert.api.ConversionException; 037import org.nuxeo.ecm.core.convert.extension.Converter; 038import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor; 039 040import com.ibm.icu.text.CharsetDetector; 041import com.ibm.icu.text.CharsetMatch; 042 043public class UTF8CharsetConverter implements Converter { 044 045 private static final String TEXT_PREFIX = "text/"; 046 047 private static final String UTF_8 = "UTF-8"; 048 049 @Override 050 public void init(ConverterDescriptor descriptor) { 051 } 052 053 @Override 054 public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { 055 Blob originalBlob = blobHolder.getBlob(); 056 String path = blobHolder.getFilePath(); 057 Blob transcodedBlob; 058 try { 059 transcodedBlob = convert(originalBlob); 060 } catch (IOException | ConversionException e) { 061 throw new ConversionException("Cannot transcode " + path + " to UTF-8", blobHolder, e); 062 } 063 return new SimpleBlobHolder(transcodedBlob); 064 } 065 066 protected Blob convert(Blob blob) throws IOException, ConversionException { 067 String mimetype = blob.getMimeType(); 068 if (mimetype == null || !mimetype.startsWith(TEXT_PREFIX)) { 069 return blob; 070 } 071 String encoding = blob.getEncoding(); 072 if (UTF_8.equals(encoding)) { 073 return blob; 074 } 075 if (StringUtils.isEmpty(encoding)) { 076 encoding = detectEncoding(blob); 077 } 078 Blob newBlob; 079 if (UTF_8.equals(encoding)) { 080 // had no encoding previously, detected as UTF-8 081 // just reuse the same blob 082 try (InputStream in = blob.getStream()) { 083 newBlob = Blobs.createBlob(in); 084 } 085 } else { 086 // decode bytes as chars in the detected charset then encode chars as bytes in UTF-8 087 try (InputStream in = new ReaderInputStream(new InputStreamReader(blob.getStream(), encoding), UTF_8)) { 088 newBlob = Blobs.createBlob(in); 089 } 090 } 091 newBlob.setMimeType(mimetype); 092 newBlob.setEncoding(UTF_8); 093 newBlob.setFilename(blob.getFilename()); 094 return newBlob; 095 } 096 097 /** 098 * @deprecated since 11.1. Use {@link #detectEncoding(Blob)} instead. 099 */ 100 @Deprecated 101 protected String detectEncoding(InputStream in) throws IOException, ConversionException { 102 return getEncoding(in).orElseThrow(() -> new ConversionException("Cannot detect source charset.")); 103 } 104 105 protected String detectEncoding(Blob blob) throws IOException, ConversionException { 106 try (InputStream stream = blob.getStream()) { 107 return getEncoding(stream).orElseThrow( 108 () -> new ConversionException("Cannot detect source charset.", blob)); 109 } 110 } 111 112 /** 113 * The private accessor is used to avoid the case when the caller don't close the stream. This method can be merged 114 * with {@link #detectEncoding(Blob)} once the {@link #detectEncoding(InputStream)} is removed. 115 */ 116 private Optional<String> getEncoding(InputStream in) throws IOException { 117 InputStream inputStream = in; 118 if (!inputStream.markSupported()) { 119 // detector.setText requires mark 120 inputStream = new BufferedInputStream(inputStream); 121 } 122 CharsetDetector detector = new CharsetDetector(); 123 detector.setText(inputStream); 124 CharsetMatch charsetMatch = detector.detect(); 125 126 return Optional.ofNullable(charsetMatch).map(CharsetMatch::getName); 127 } 128}