001/*
002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Olivier Grisel
016 *     Florent Guillaume
017 */
018package org.nuxeo.ecm.platform.filemanager.service.extension;
019
020import java.io.IOException;
021import java.nio.ByteBuffer;
022import java.nio.CharBuffer;
023import java.nio.charset.CharacterCodingException;
024import java.nio.charset.Charset;
025import java.nio.charset.CharsetDecoder;
026import java.nio.charset.CodingErrorAction;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.List;
030
031import org.apache.commons.logging.Log;
032import org.apache.commons.logging.LogFactory;
033import org.nuxeo.ecm.core.api.Blob;
034import org.nuxeo.ecm.core.api.DocumentModel;
035import org.nuxeo.ecm.core.api.NuxeoException;
036
037import com.ibm.icu.text.CharsetDetector;
038import com.ibm.icu.text.CharsetMatch;
039
040/**
041 * Imports the string content of a blob as text for the content of the "note" field of a new Note document.
042 * <p>
043 * If an existing document with the same title is found the existing Note document is updated instead.
044 */
045public class NoteImporter extends AbstractFileImporter {
046
047    private static final Log log = LogFactory.getLog(NoteImporter.class);
048
049    private static final String NOTE_TYPE = "Note";
050
051    private static final String NOTE_SCHEMA = "note";
052
053    private static final String NOTE_FIELD = "note";
054
055    private static final String MT_FIELD = "mime_type";
056
057    private static final long serialVersionUID = 1L;
058
059    @Override
060    public String getDefaultDocType() {
061        return NOTE_TYPE;
062    }
063
064    @Override
065    public boolean isOverwriteByTitle() {
066        return true;
067    }
068
069    @Override
070    public boolean updateDocumentIfPossible(DocumentModel doc, Blob content) {
071        if (!doc.hasSchema(NOTE_SCHEMA)) {
072            log.warn("Schema '" + NOTE_SCHEMA + "' is not available for document " + doc);
073            return false;
074        }
075        return super.updateDocumentIfPossible(doc, content);
076    }
077
078    @Override
079    public void updateDocument(DocumentModel doc, Blob content) {
080        String string;
081        try {
082            string = getString(content);
083        } catch (IOException e) {
084            throw new NuxeoException(e);
085        }
086        doc.setProperty(NOTE_SCHEMA, NOTE_FIELD, string);
087        doc.setProperty(NOTE_SCHEMA, MT_FIELD, content.getMimeType());
088    }
089
090    protected String getString(Blob blob) throws IOException {
091        String s = guessEncoding(blob);
092        if (s == null) {
093            s = blob.getString(); // uses default charset
094        }
095        return s;
096    }
097
098    protected static String guessEncoding(Blob blob) throws IOException {
099        // encoding already known?
100        if (blob.getEncoding() != null) {
101            return null;
102        }
103
104        // bad mime type?
105        String mimeType = blob.getMimeType();
106        if (mimeType == null) {
107            return null;
108        }
109        if (!mimeType.startsWith("text/") && !mimeType.startsWith("application/xhtml")) {
110            // not a text file, we shouldn't be in the Note importer
111            return null;
112        }
113
114        byte[] bytes = blob.getByteArray();
115
116        List<String> charsets = new ArrayList<>(Arrays.asList("utf-8", "iso-8859-1"));
117
118        String CSEQ = "charset=";
119        int i = mimeType.indexOf(CSEQ);
120        if (i > 0) {
121            // charset specified in MIME type
122            String onlyMimeType = mimeType.substring(0, i).replace(";", "").trim();
123            blob.setMimeType(onlyMimeType);
124            String charset = mimeType.substring(i + CSEQ.length());
125            i = charset.indexOf(";");
126            if (i > 0) {
127                charset = charset.substring(0, i);
128            }
129            charset = charset.trim().replace("\"", "");
130            charsets.add(0, charset);
131        } else {
132            // charset detected from the actual bytes
133            CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect();
134            if (charsetMatch != null) {
135                String charset = charsetMatch.getName();
136                charsets.add(0, charset);
137            }
138        }
139
140        // now convert the string according to the charset, and fallback on others if not possible
141        for (String charset : charsets) {
142            try {
143                Charset cs = Charset.forName(charset);
144                CharsetDecoder d = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(
145                        CodingErrorAction.REPORT);
146                CharBuffer cb = d.decode(ByteBuffer.wrap(bytes));
147                if (cb.length() != 0 && cb.charAt(0) == '\ufeff') {
148                    // remove BOM
149                    cb = cb.subSequence(1, cb.length());
150                }
151                return cb.toString();
152            } catch (IllegalArgumentException e) {
153                // illegal charset
154            } catch (CharacterCodingException e) {
155                // could not decode
156            }
157        }
158        // nothing worked, use platform
159        return null;
160    }
161
162}