/*
 * (C) Copyright 2009 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Olivier Grisel
 */
package org.nuxeo.ecm.platform.categorization.service;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.common.utils.StringUtils;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.DocumentModelList;
import org.nuxeo.ecm.core.api.DocumentRef;
import org.nuxeo.ecm.core.api.PropertyException;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.runtime.api.Framework;
import org.nuxeo.runtime.model.DefaultComponent;
import org.nuxeo.runtime.model.Extension;

/**
 * Runtime component that applies registered {@link CategorizerDescriptor}
 * contributions to documents: for each document it extracts the fulltext
 * content (text properties plus converted binary blobs) and feeds it to every
 * enabled categorizer whose filter accepts the document.
 */
public class DocumentCategorizationServiceImpl extends DefaultComponent
        implements DocumentCategorizationService {

    public static final String CATEGORIZERS_XP_NAME = "categorizers";

    /** Name of the converter chain used to turn any blob into plain text. */
    public static final String ANY2TEXT = "any2text";

    private static final Log log = LogFactory.getLog(DocumentCategorizationServiceImpl.class);

    // Lazily-built merge of all enabled contributions, keyed by categorizer
    // name. volatile is required for the double-checked locking in
    // getMergedDescriptors(): without it a reader outside the lock could see
    // a partially-initialized map.
    protected volatile Map<String, CategorizerDescriptor> mergedCategorizers;

    // Raw contributions in registration order; duplicates by name are merged
    // lazily in getMergedDescriptors(). Guarded by 'this'.
    protected final List<CategorizerDescriptor> registeredCategorizers = new ArrayList<CategorizerDescriptor>();

    protected final BlobsExtractor extractor = new BlobsExtractor();

    // Lazily resolved; Framework.getService is idempotent so the benign race
    // in getConversionService() is harmless.
    protected ConversionService conversionService;

    @Override
    public void registerExtension(Extension extension) {
        if (extension.getExtensionPoint().equals(CATEGORIZERS_XP_NAME)) {
            Object[] contribs = extension.getContributions();
            for (Object contrib : contribs) {
                if (contrib instanceof CategorizerDescriptor) {
                    registerCategorizerDescriptor((CategorizerDescriptor) contrib, extension);
                }
            }
        }
    }

    @Override
    public void unregisterExtension(Extension extension) {
        if (extension.getExtensionPoint().equals(CATEGORIZERS_XP_NAME)) {
            Object[] contribs = extension.getContributions();
            for (Object contrib : contribs) {
                if (contrib instanceof CategorizerDescriptor) {
                    unregisterCategorizerDescriptor((CategorizerDescriptor) contrib, extension);
                }
            }
        }
    }

    /**
     * Registers a categorizer contribution and invalidates the merged view.
     * Synchronized for consistency with
     * {@link #unregisterCategorizerDescriptor} since both mutate the shared
     * registration list.
     */
    protected synchronized void registerCategorizerDescriptor(CategorizerDescriptor descriptor, Extension extension) {

        descriptor.initializeInContext(extension.getContext());

        // register and invalidate merged Categorizers
        registeredCategorizers.add(descriptor);
        mergedCategorizers = null;
    }

    /**
     * Removes the most recently registered matching contribution and
     * invalidates the merged view; logs a warning when nothing matches.
     */
    protected synchronized void unregisterCategorizerDescriptor(CategorizerDescriptor descriptor, Extension extension) {

        int index = registeredCategorizers.lastIndexOf(descriptor);
        if (index != -1) {
            registeredCategorizers.remove(index);
            mergedCategorizers = null;
        } else {
            log.warn(String.format("no registered Categorizer under name '%s'", descriptor.getName()));
        }
    }

    /**
     * Returns the merged, enabled categorizers keyed by name, building the
     * map lazily. Later contributions with the same name are merged over
     * earlier ones; a disabled contribution removes the name entirely.
     * Thread-safe via double-checked locking on the volatile field.
     */
    protected Map<String, CategorizerDescriptor> getMergedDescriptors() {
        Map<String, CategorizerDescriptor> merged = mergedCategorizers;
        if (merged == null) {
            synchronized (this) {
                merged = mergedCategorizers;
                if (merged == null) {
                    merged = new LinkedHashMap<String, CategorizerDescriptor>();
                    for (CategorizerDescriptor descriptor : registeredCategorizers) {
                        String name = descriptor.getName();
                        if (descriptor.isEnabled()) {
                            CategorizerDescriptor previousDescriptor = merged.get(name);
                            CategorizerDescriptor mergedDescriptor = new CategorizerDescriptor();
                            mergedDescriptor.merge(previousDescriptor);
                            mergedDescriptor.merge(descriptor);
                            merged.put(name, mergedDescriptor);
                        } else {
                            merged.remove(name);
                        }
                    }
                    // publish only the fully-built map
                    mergedCategorizers = merged;
                }
            }
        }
        return merged;
    }

    public List<DocumentModel> updateCategories(CoreSession session, List<DocumentRef> docRefs) {
        DocumentModelList documents = session.getDocuments(docRefs.toArray(new DocumentRef[docRefs.size()]));
        return updateCategories(documents);
    }

    /**
     * Runs every matching categorizer on each document and returns the list
     * of documents that were actually processed (insertion order, no
     * duplicates). Documents whose extracted text is shorter than a
     * categorizer's minimum length are skipped by that categorizer but still
     * reported as impacted.
     */
    public List<DocumentModel> updateCategories(List<DocumentModel> documents) {

        Set<DocumentModel> impactedDocs = new LinkedHashSet<DocumentModel>();

        for (DocumentModel doc : documents) {
            List<CategorizerDescriptor> categorizersToApply = new LinkedList<CategorizerDescriptor>();
            for (CategorizerDescriptor categorizer : getMergedDescriptors().values()) {
                if (categorizer.shouldProcess(doc)) {
                    categorizersToApply.add(categorizer);
                }
            }
            if (!categorizersToApply.isEmpty()) {
                // avoid extracting the fulltext content if no categorizer to
                // apply
                String textContent = extractTextContent(doc);
                for (CategorizerDescriptor categorizer : categorizersToApply) {
                    if (textContent.length() > categorizer.getMinTextLength()) {
                        categorizer.processDocument(doc, textContent);
                    }
                }
                impactedDocs.add(doc);
            }
        }
        return new ArrayList<DocumentModel>(impactedDocs);
    }

    /**
     * Extracts a newline-joined plain-text representation of the document:
     * title, dc:description, the note:note HTML content (when present) and
     * every binary blob converted through the {@link #ANY2TEXT} chain.
     * Conversion failures are logged and the corresponding blob skipped.
     */
    public String extractTextContent(DocumentModel doc) {
        List<String> strings = new LinkedList<String>();

        // text properties; guard against null title so the literal string
        // "null" does not pollute the categorization input
        String title = doc.getTitle();
        if (title != null) {
            strings.add(title);
        }
        String description = doc.getProperty("dc:description").getValue(String.class);
        if (description != null) {
            strings.add(description);
        }
        // TODO: extract / factorize / reuse the SQL storage full-text indexing
        // text extraction code

        List<Blob> blobs = extractor.getBlobs(doc);
        try {
            String noteContent = (String) doc.getPropertyValue("note:note");
            if (noteContent != null) {
                blobs.add(Blobs.createBlob(noteContent, "text/html"));
            }
        } catch (PropertyException pe) {
            // not a note, ignore
        }

        // binary properties
        ConversionService service = getConversionService();
        for (Blob blob : blobs) {
            try {
                SimpleBlobHolder bh = new SimpleBlobHolder(blob);
                BlobHolder result = service.convert(ANY2TEXT, bh, null);
                if (result == null) {
                    continue;
                }
                Blob textBlob = result.getBlob();
                if (textBlob == null) {
                    continue;
                }
                String string = new String(textBlob.getByteArray(), StandardCharsets.UTF_8);
                // strip '\0 chars from text
                if (string.indexOf('\0') >= 0) {
                    string = string.replace("\0", " ");
                }
                strings.add(string);
            } catch (ConversionException | IOException e) {
                // best-effort extraction: log and move on to the next blob
                log.error(e.getMessage(), e);
            }
        }
        return StringUtils.join(strings, "\n");
    }

    /** Lazily resolves the conversion service from the runtime framework. */
    protected ConversionService getConversionService() {
        if (conversionService == null) {
            conversionService = Framework.getService(ConversionService.class);
        }
        return conversionService;
    }
}