001/*
002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Olivier Grisel
018 *     Florent Guillaume
019 */
020package org.nuxeo.ecm.platform.filemanager.service.extension;
021
022import java.io.IOException;
023import java.nio.ByteBuffer;
024import java.nio.CharBuffer;
025import java.nio.charset.CharacterCodingException;
026import java.nio.charset.Charset;
027import java.nio.charset.CharsetDecoder;
028import java.nio.charset.CodingErrorAction;
029import java.util.ArrayList;
030import java.util.Arrays;
031import java.util.List;
032
033import org.apache.commons.logging.Log;
034import org.apache.commons.logging.LogFactory;
035import org.nuxeo.ecm.core.api.Blob;
036import org.nuxeo.ecm.core.api.DocumentModel;
037import org.nuxeo.ecm.core.api.NuxeoException;
038
039import com.ibm.icu.text.CharsetDetector;
040import com.ibm.icu.text.CharsetMatch;
041
042/**
043 * Imports the string content of a blob as text for the content of the "note" field of a new Note document.
044 * <p>
045 * If an existing document with the same title is found the existing Note document is updated instead.
046 */
047public class NoteImporter extends AbstractFileImporter {
048
049    private static final Log log = LogFactory.getLog(NoteImporter.class);
050
051    private static final String NOTE_TYPE = "Note";
052
053    private static final String NOTE_SCHEMA = "note";
054
055    private static final String NOTE_FIELD = "note";
056
057    private static final String MT_FIELD = "mime_type";
058
059    private static final long serialVersionUID = 1L;
060
061    @Override
062    public String getDefaultDocType() {
063        return NOTE_TYPE;
064    }
065
066    @Override
067    public boolean isOverwriteByTitle() {
068        return true;
069    }
070
071    @Override
072    public boolean updateDocumentIfPossible(DocumentModel doc, Blob content) {
073        if (!doc.hasSchema(NOTE_SCHEMA)) {
074            log.warn("Schema '" + NOTE_SCHEMA + "' is not available for document " + doc);
075            return false;
076        }
077        return super.updateDocumentIfPossible(doc, content);
078    }
079
080    @Override
081    public void updateDocument(DocumentModel doc, Blob content) {
082        String string;
083        try {
084            string = getString(content);
085        } catch (IOException e) {
086            throw new NuxeoException(e);
087        }
088        doc.setProperty(NOTE_SCHEMA, NOTE_FIELD, string);
089        doc.setProperty(NOTE_SCHEMA, MT_FIELD, content.getMimeType());
090    }
091
092    protected String getString(Blob blob) throws IOException {
093        String s = guessEncoding(blob);
094        if (s == null) {
095            s = blob.getString(); // uses default charset
096        }
097        return s;
098    }
099
100    protected static String guessEncoding(Blob blob) throws IOException {
101        // encoding already known?
102        if (blob.getEncoding() != null) {
103            return null;
104        }
105
106        // bad mime type?
107        String mimeType = blob.getMimeType();
108        if (mimeType == null) {
109            return null;
110        }
111        if (!mimeType.startsWith("text/") && !mimeType.startsWith("application/xhtml")) {
112            // not a text file, we shouldn't be in the Note importer
113            return null;
114        }
115
116        byte[] bytes = blob.getByteArray();
117
118        List<String> charsets = new ArrayList<>(Arrays.asList("utf-8", "iso-8859-1"));
119
120        String CSEQ = "charset=";
121        int i = mimeType.indexOf(CSEQ);
122        if (i > 0) {
123            // charset specified in MIME type
124            String onlyMimeType = mimeType.substring(0, i).replace(";", "").trim();
125            blob.setMimeType(onlyMimeType);
126            String charset = mimeType.substring(i + CSEQ.length());
127            i = charset.indexOf(";");
128            if (i > 0) {
129                charset = charset.substring(0, i);
130            }
131            charset = charset.trim().replace("\"", "");
132            charsets.add(0, charset);
133        } else {
134            // charset detected from the actual bytes
135            CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect();
136            if (charsetMatch != null) {
137                String charset = charsetMatch.getName();
138                charsets.add(0, charset);
139            }
140        }
141
142        // now convert the string according to the charset, and fallback on others if not possible
143        for (String charset : charsets) {
144            try {
145                Charset cs = Charset.forName(charset);
146                CharsetDecoder d = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(
147                        CodingErrorAction.REPORT);
148                CharBuffer cb = d.decode(ByteBuffer.wrap(bytes));
149                if (cb.length() != 0 && cb.charAt(0) == '\ufeff') {
150                    // remove BOM
151                    cb = cb.subSequence(1, cb.length());
152                }
153                return cb.toString();
154            } catch (IllegalArgumentException e) {
155                // illegal charset
156            } catch (CharacterCodingException e) {
157                // could not decode
158            }
159        }
160        // nothing worked, use platform
161        return null;
162    }
163
164}