001/*
002 * (C) Copyright 2013 Nuxeo SA (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl-2.1.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Stephane Lacoin
016 */
017
018package org.nuxeo.ecm.platform.convert.plugins;
019
020import java.io.BufferedInputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.InputStreamReader;
024import java.io.Serializable;
025import java.util.Map;
026
027import org.apache.commons.io.input.ReaderInputStream;
028import org.apache.commons.lang.StringUtils;
029import org.nuxeo.ecm.core.api.Blob;
030import org.nuxeo.ecm.core.api.Blobs;
031import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
032import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
033import org.nuxeo.ecm.core.convert.api.ConversionException;
034import org.nuxeo.ecm.core.convert.extension.Converter;
035import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
036
037import com.ibm.icu.text.CharsetDetector;
038import com.ibm.icu.text.CharsetMatch;
039
040public class UTF8CharsetConverter implements Converter {
041
042    private static final String TEXT_PREFIX = "text/";
043
044    private static final String UTF_8 = "UTF-8";
045
046    @Override
047    public void init(ConverterDescriptor descriptor) {
048    }
049
050    @Override
051    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
052        Blob originalBlob = blobHolder.getBlob();
053        String path = blobHolder.getFilePath();
054        Blob transcodedBlob;
055        try {
056            transcodedBlob = convert(originalBlob);
057        } catch (IOException | ConversionException e) {
058            throw new ConversionException("Cannot transcode " + path + " to UTF-8", e);
059        }
060        return new SimpleBlobHolder(transcodedBlob);
061    }
062
063    protected Blob convert(Blob blob) throws IOException, ConversionException {
064        String mimetype = blob.getMimeType();
065        if (mimetype == null || !mimetype.startsWith(TEXT_PREFIX)) {
066            return blob;
067        }
068        String encoding = blob.getEncoding();
069        if (UTF_8.equals(encoding)) {
070            return blob;
071        }
072        if (StringUtils.isEmpty(encoding)) {
073            try (InputStream in = blob.getStream()) {
074                encoding = detectEncoding(in);
075            }
076        }
077        Blob newBlob;
078        if (UTF_8.equals(encoding)) {
079            // had no encoding previously, detected as UTF-8
080            // just reuse the same blob
081            try (InputStream in = blob.getStream()) {
082                newBlob = Blobs.createBlob(in);
083            }
084        } else {
085            // decode bytes as chars in the detected charset then encode chars as bytes in UTF-8
086            try (InputStream in = new ReaderInputStream(new InputStreamReader(blob.getStream(), encoding), UTF_8)) {
087                newBlob = Blobs.createBlob(in);
088            }
089        }
090        newBlob.setMimeType(mimetype);
091        newBlob.setEncoding(UTF_8);
092        newBlob.setFilename(blob.getFilename());
093        return newBlob;
094    }
095
096    protected String detectEncoding(InputStream in) throws IOException, ConversionException {
097        if (!in.markSupported()) {
098            // detector.setText requires mark
099            in = new BufferedInputStream(in);
100        }
101        CharsetDetector detector = new CharsetDetector();
102        detector.setText(in);
103        CharsetMatch charsetMatch = detector.detect();
104        if (charsetMatch == null) {
105            throw new ConversionException("Cannot detect source charset.");
106        }
107        return charsetMatch.getName();
108    }
109
110}