001/* 002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Olivier Grisel 018 * Florent Guillaume 019 */ 020package org.nuxeo.ecm.platform.filemanager.service.extension; 021 022import java.io.IOException; 023import java.nio.ByteBuffer; 024import java.nio.CharBuffer; 025import java.nio.charset.CharacterCodingException; 026import java.nio.charset.Charset; 027import java.nio.charset.CharsetDecoder; 028import java.nio.charset.CodingErrorAction; 029import java.util.ArrayList; 030import java.util.Arrays; 031import java.util.List; 032 033import org.apache.commons.logging.Log; 034import org.apache.commons.logging.LogFactory; 035import org.nuxeo.ecm.core.api.Blob; 036import org.nuxeo.ecm.core.api.DocumentModel; 037import org.nuxeo.ecm.core.api.NuxeoException; 038 039import com.ibm.icu.text.CharsetDetector; 040import com.ibm.icu.text.CharsetMatch; 041 042/** 043 * Imports the string content of a blob as text for the content of the "note" field of a new Note document. 044 * <p> 045 * If an existing document with the same title is found the existing Note document is updated instead. 046 */ 047public class NoteImporter extends AbstractFileImporter { 048 049 private static final Log log = LogFactory.getLog(NoteImporter.class); 050 051 private static final String NOTE_TYPE = "Note"; 052 053 private static final String NOTE_SCHEMA = "note"; 054 055 private static final String NOTE_FIELD = "note"; 056 057 private static final String MT_FIELD = "mime_type"; 058 059 private static final long serialVersionUID = 1L; 060 061 @Override 062 public String getDefaultDocType() { 063 return NOTE_TYPE; 064 } 065 066 @Override 067 public boolean isOverwriteByTitle() { 068 return true; 069 } 070 071 @Override 072 public boolean updateDocumentIfPossible(DocumentModel doc, Blob content) { 073 if (!doc.hasSchema(NOTE_SCHEMA)) { 074 log.warn("Schema '" + NOTE_SCHEMA + "' is not available for document " + doc); 075 return false; 076 } 077 return super.updateDocumentIfPossible(doc, content); 078 } 079 080 @Override 081 public void updateDocument(DocumentModel doc, Blob content) { 082 String string; 083 try { 084 string = getString(content); 085 } catch (IOException e) { 086 throw new NuxeoException(e); 087 } 088 doc.setProperty(NOTE_SCHEMA, NOTE_FIELD, string); 089 doc.setProperty(NOTE_SCHEMA, MT_FIELD, content.getMimeType()); 090 } 091 092 protected String getString(Blob blob) throws IOException { 093 String s = guessEncoding(blob); 094 if (s == null) { 095 s = blob.getString(); // uses default charset 096 } 097 return s; 098 } 099 100 protected static String guessEncoding(Blob blob) throws IOException { 101 // encoding already known? 102 if (blob.getEncoding() != null) { 103 return null; 104 } 105 106 // bad mime type? 107 String mimeType = blob.getMimeType(); 108 if (mimeType == null) { 109 return null; 110 } 111 if (!mimeType.startsWith("text/") && !mimeType.startsWith("application/xhtml")) { 112 // not a text file, we shouldn't be in the Note importer 113 return null; 114 } 115 116 byte[] bytes = blob.getByteArray(); 117 118 List<String> charsets = new ArrayList<>(Arrays.asList("utf-8", "iso-8859-1")); 119 120 String CSEQ = "charset="; 121 int i = mimeType.indexOf(CSEQ); 122 if (i > 0) { 123 // charset specified in MIME type 124 String onlyMimeType = mimeType.substring(0, i).replace(";", "").trim(); 125 blob.setMimeType(onlyMimeType); 126 String charset = mimeType.substring(i + CSEQ.length()); 127 i = charset.indexOf(";"); 128 if (i > 0) { 129 charset = charset.substring(0, i); 130 } 131 charset = charset.trim().replace("\"", ""); 132 charsets.add(0, charset); 133 } else { 134 // charset detected from the actual bytes 135 CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect(); 136 if (charsetMatch != null) { 137 String charset = charsetMatch.getName(); 138 charsets.add(0, charset); 139 } 140 } 141 142 // now convert the string according to the charset, and fallback on others if not possible 143 for (String charset : charsets) { 144 try { 145 Charset cs = Charset.forName(charset); 146 CharsetDecoder d = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter( 147 CodingErrorAction.REPORT); 148 CharBuffer cb = d.decode(ByteBuffer.wrap(bytes)); 149 if (cb.length() != 0 && cb.charAt(0) == '\ufeff') { 150 // remove BOM 151 cb = cb.subSequence(1, cb.length()); 152 } 153 return cb.toString(); 154 } catch (IllegalArgumentException e) { 155 // illegal charset 156 } catch (CharacterCodingException e) { 157 // could not decode 158 } 159 } 160 // nothing worked, use platform 161 return null; 162 } 163 164}