001/*
002 * (C) Copyright 2013 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Stephane Lacoin
018 */
019
020package org.nuxeo.ecm.platform.convert.plugins;
021
022import java.io.BufferedInputStream;
023import java.io.IOException;
024import java.io.InputStream;
025import java.io.InputStreamReader;
026import java.io.Serializable;
027import java.util.Map;
028
029import org.apache.commons.io.input.ReaderInputStream;
030import org.apache.commons.lang.StringUtils;
031import org.nuxeo.ecm.core.api.Blob;
032import org.nuxeo.ecm.core.api.Blobs;
033import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
034import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
035import org.nuxeo.ecm.core.convert.api.ConversionException;
036import org.nuxeo.ecm.core.convert.extension.Converter;
037import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
038
039import com.ibm.icu.text.CharsetDetector;
040import com.ibm.icu.text.CharsetMatch;
041
042public class UTF8CharsetConverter implements Converter {
043
044    private static final String TEXT_PREFIX = "text/";
045
046    private static final String UTF_8 = "UTF-8";
047
048    @Override
049    public void init(ConverterDescriptor descriptor) {
050    }
051
052    @Override
053    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
054        Blob originalBlob = blobHolder.getBlob();
055        String path = blobHolder.getFilePath();
056        Blob transcodedBlob;
057        try {
058            transcodedBlob = convert(originalBlob);
059        } catch (IOException | ConversionException e) {
060            throw new ConversionException("Cannot transcode " + path + " to UTF-8", e);
061        }
062        return new SimpleBlobHolder(transcodedBlob);
063    }
064
065    protected Blob convert(Blob blob) throws IOException, ConversionException {
066        String mimetype = blob.getMimeType();
067        if (mimetype == null || !mimetype.startsWith(TEXT_PREFIX)) {
068            return blob;
069        }
070        String encoding = blob.getEncoding();
071        if (UTF_8.equals(encoding)) {
072            return blob;
073        }
074        if (StringUtils.isEmpty(encoding)) {
075            try (InputStream in = blob.getStream()) {
076                encoding = detectEncoding(in);
077            }
078        }
079        Blob newBlob;
080        if (UTF_8.equals(encoding)) {
081            // had no encoding previously, detected as UTF-8
082            // just reuse the same blob
083            try (InputStream in = blob.getStream()) {
084                newBlob = Blobs.createBlob(in);
085            }
086        } else {
087            // decode bytes as chars in the detected charset then encode chars as bytes in UTF-8
088            try (InputStream in = new ReaderInputStream(new InputStreamReader(blob.getStream(), encoding), UTF_8)) {
089                newBlob = Blobs.createBlob(in);
090            }
091        }
092        newBlob.setMimeType(mimetype);
093        newBlob.setEncoding(UTF_8);
094        newBlob.setFilename(blob.getFilename());
095        return newBlob;
096    }
097
098    protected String detectEncoding(InputStream in) throws IOException, ConversionException {
099        if (!in.markSupported()) {
100            // detector.setText requires mark
101            in = new BufferedInputStream(in);
102        }
103        CharsetDetector detector = new CharsetDetector();
104        detector.setText(in);
105        CharsetMatch charsetMatch = detector.detect();
106        if (charsetMatch == null) {
107            throw new ConversionException("Cannot detect source charset.");
108        }
109        return charsetMatch.getName();
110    }
111
112}