001/* 002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and contributors. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the GNU Lesser General Public License 006 * (LGPL) version 2.1 which accompanies this distribution, and is available at 007 * http://www.gnu.org/licenses/lgpl.html 008 * 009 * This library is distributed in the hope that it will be useful, 010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 012 * Lesser General Public License for more details. 013 * 014 * Contributors: 015 * Olivier Grisel 016 * Florent Guillaume 017 */ 018package org.nuxeo.ecm.platform.filemanager.service.extension; 019 020import java.io.IOException; 021import java.nio.ByteBuffer; 022import java.nio.CharBuffer; 023import java.nio.charset.CharacterCodingException; 024import java.nio.charset.Charset; 025import java.nio.charset.CharsetDecoder; 026import java.nio.charset.CodingErrorAction; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.List; 030 031import org.apache.commons.logging.Log; 032import org.apache.commons.logging.LogFactory; 033import org.nuxeo.ecm.core.api.Blob; 034import org.nuxeo.ecm.core.api.DocumentModel; 035import org.nuxeo.ecm.core.api.NuxeoException; 036 037import com.ibm.icu.text.CharsetDetector; 038import com.ibm.icu.text.CharsetMatch; 039 040/** 041 * Imports the string content of a blob as text for the content of the "note" field of a new Note document. 042 * <p> 043 * If an existing document with the same title is found the existing Note document is updated instead. 044 */ 045public class NoteImporter extends AbstractFileImporter { 046 047 private static final Log log = LogFactory.getLog(NoteImporter.class); 048 049 private static final String NOTE_TYPE = "Note"; 050 051 private static final String NOTE_SCHEMA = "note"; 052 053 private static final String NOTE_FIELD = "note"; 054 055 private static final String MT_FIELD = "mime_type"; 056 057 private static final long serialVersionUID = 1L; 058 059 @Override 060 public String getDefaultDocType() { 061 return NOTE_TYPE; 062 } 063 064 @Override 065 public boolean isOverwriteByTitle() { 066 return true; 067 } 068 069 @Override 070 public boolean updateDocumentIfPossible(DocumentModel doc, Blob content) { 071 if (!doc.hasSchema(NOTE_SCHEMA)) { 072 log.warn("Schema '" + NOTE_SCHEMA + "' is not available for document " + doc); 073 return false; 074 } 075 return super.updateDocumentIfPossible(doc, content); 076 } 077 078 @Override 079 public void updateDocument(DocumentModel doc, Blob content) { 080 String string; 081 try { 082 string = getString(content); 083 } catch (IOException e) { 084 throw new NuxeoException(e); 085 } 086 doc.setProperty(NOTE_SCHEMA, NOTE_FIELD, string); 087 doc.setProperty(NOTE_SCHEMA, MT_FIELD, content.getMimeType()); 088 } 089 090 protected String getString(Blob blob) throws IOException { 091 String s = guessEncoding(blob); 092 if (s == null) { 093 s = blob.getString(); // uses default charset 094 } 095 return s; 096 } 097 098 protected static String guessEncoding(Blob blob) throws IOException { 099 // encoding already known? 100 if (blob.getEncoding() != null) { 101 return null; 102 } 103 104 // bad mime type? 105 String mimeType = blob.getMimeType(); 106 if (mimeType == null) { 107 return null; 108 } 109 if (!mimeType.startsWith("text/") && !mimeType.startsWith("application/xhtml")) { 110 // not a text file, we shouldn't be in the Note importer 111 return null; 112 } 113 114 byte[] bytes = blob.getByteArray(); 115 116 List<String> charsets = new ArrayList<>(Arrays.asList("utf-8", "iso-8859-1")); 117 118 String CSEQ = "charset="; 119 int i = mimeType.indexOf(CSEQ); 120 if (i > 0) { 121 // charset specified in MIME type 122 String onlyMimeType = mimeType.substring(0, i).replace(";", "").trim(); 123 blob.setMimeType(onlyMimeType); 124 String charset = mimeType.substring(i + CSEQ.length()); 125 i = charset.indexOf(";"); 126 if (i > 0) { 127 charset = charset.substring(0, i); 128 } 129 charset = charset.trim().replace("\"", ""); 130 charsets.add(0, charset); 131 } else { 132 // charset detected from the actual bytes 133 CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect(); 134 if (charsetMatch != null) { 135 String charset = charsetMatch.getName(); 136 charsets.add(0, charset); 137 } 138 } 139 140 // now convert the string according to the charset, and fallback on others if not possible 141 for (String charset : charsets) { 142 try { 143 Charset cs = Charset.forName(charset); 144 CharsetDecoder d = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter( 145 CodingErrorAction.REPORT); 146 CharBuffer cb = d.decode(ByteBuffer.wrap(bytes)); 147 if (cb.length() != 0 && cb.charAt(0) == '\ufeff') { 148 // remove BOM 149 cb = cb.subSequence(1, cb.length()); 150 } 151 return cb.toString(); 152 } catch (IllegalArgumentException e) { 153 // illegal charset 154 } catch (CharacterCodingException e) { 155 // could not decode 156 } 157 } 158 // nothing worked, use platform 159 return null; 160 } 161 162}