001/*
002 * (C) Copyright 2013-2018 Nuxeo (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Stephane Lacoin
018 */
019
020package org.nuxeo.ecm.platform.convert.plugins;
021
022import java.io.BufferedInputStream;
023import java.io.IOException;
024import java.io.InputStream;
025import java.io.InputStreamReader;
026import java.io.Serializable;
027import java.util.Map;
028import java.util.Optional;
029
030import org.apache.commons.io.input.ReaderInputStream;
031import org.apache.commons.lang3.StringUtils;
032import org.nuxeo.ecm.core.api.Blob;
033import org.nuxeo.ecm.core.api.Blobs;
034import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
035import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
036import org.nuxeo.ecm.core.convert.api.ConversionException;
037import org.nuxeo.ecm.core.convert.extension.Converter;
038import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
039
040import com.ibm.icu.text.CharsetDetector;
041import com.ibm.icu.text.CharsetMatch;
042
043public class UTF8CharsetConverter implements Converter {
044
045    private static final String TEXT_PREFIX = "text/";
046
047    private static final String UTF_8 = "UTF-8";
048
049    @Override
050    public void init(ConverterDescriptor descriptor) {
051    }
052
053    @Override
054    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
055        Blob originalBlob = blobHolder.getBlob();
056        String path = blobHolder.getFilePath();
057        Blob transcodedBlob;
058        try {
059            transcodedBlob = convert(originalBlob);
060        } catch (IOException | ConversionException e) {
061            throw new ConversionException("Cannot transcode " + path + " to UTF-8", blobHolder, e);
062        }
063        return new SimpleBlobHolder(transcodedBlob);
064    }
065
066    protected Blob convert(Blob blob) throws IOException, ConversionException {
067        String mimetype = blob.getMimeType();
068        if (mimetype == null || !mimetype.startsWith(TEXT_PREFIX)) {
069            return blob;
070        }
071        String encoding = blob.getEncoding();
072        if (UTF_8.equals(encoding)) {
073            return blob;
074        }
075        if (StringUtils.isEmpty(encoding)) {
076            encoding = detectEncoding(blob);
077        }
078        Blob newBlob;
079        if (UTF_8.equals(encoding)) {
080            // had no encoding previously, detected as UTF-8
081            // just reuse the same blob
082            try (InputStream in = blob.getStream()) {
083                newBlob = Blobs.createBlob(in);
084            }
085        } else {
086            // decode bytes as chars in the detected charset then encode chars as bytes in UTF-8
087            try (InputStream in = new ReaderInputStream(new InputStreamReader(blob.getStream(), encoding), UTF_8)) {
088                newBlob = Blobs.createBlob(in);
089            }
090        }
091        newBlob.setMimeType(mimetype);
092        newBlob.setEncoding(UTF_8);
093        newBlob.setFilename(blob.getFilename());
094        return newBlob;
095    }
096
097    /**
098     * @deprecated since 11.1. Use {@link #detectEncoding(Blob)} instead.
099     */
100    @Deprecated
101    protected String detectEncoding(InputStream in) throws IOException, ConversionException {
102        return getEncoding(in).orElseThrow(() -> new ConversionException("Cannot detect source charset."));
103    }
104
105    protected String detectEncoding(Blob blob) throws IOException, ConversionException {
106        try (InputStream stream = blob.getStream()) {
107            return getEncoding(stream).orElseThrow(
108                    () -> new ConversionException("Cannot detect source charset.", blob));
109        }
110    }
111
112    /**
113     * The private accessor is used to avoid the case when the caller don't close the stream. This method can be merged
114     * with {@link #detectEncoding(Blob)} once the {@link #detectEncoding(InputStream)} is removed.
115     */
116    private Optional<String> getEncoding(InputStream in) throws IOException {
117        InputStream inputStream = in;
118        if (!inputStream.markSupported()) {
119            // detector.setText requires mark
120            inputStream = new BufferedInputStream(inputStream);
121        }
122        CharsetDetector detector = new CharsetDetector();
123        detector.setText(inputStream);
124        CharsetMatch charsetMatch = detector.detect();
125
126        return Optional.ofNullable(charsetMatch).map(CharsetMatch::getName);
127    }
128}