/*
 * (C) Copyright 2009 Nuxeo SAS (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Olivier Grisel
 */
package org.nuxeo.ecm.platform.categorization.service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.common.utils.StringUtils;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.DocumentModelList;
import org.nuxeo.ecm.core.api.DocumentRef;
import org.nuxeo.ecm.core.api.PropertyException;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.runtime.api.Framework;
import org.nuxeo.runtime.model.DefaultComponent;
import org.nuxeo.runtime.model.Extension;

/**
 * Runtime component that collects {@link CategorizerDescriptor} contributions made to the
 * {@value #CATEGORIZERS_XP_NAME} extension point and applies the enabled ones to documents, feeding them the
 * text extracted from each document.
 */
public class DocumentCategorizationServiceImpl extends DefaultComponent implements
        DocumentCategorizationService {

    /** Name of the extension point that accepts {@link CategorizerDescriptor} contributions. */
    public static final String CATEGORIZERS_XP_NAME = "categorizers";

    /** Name of the converter used to turn blobs into plain text. */
    public static final String ANY2TEXT = "any2text";

    private static final Log log = LogFactory.getLog(DocumentCategorizationServiceImpl.class);

    /**
     * Lazily computed view of the registered descriptors, merged by name; {@code null} means "needs to be
     * recomputed". Declared {@code volatile} so that the unsynchronized read in
     * {@link #getMergedDescriptors()} only ever observes a completely built map.
     */
    protected volatile Map<String, CategorizerDescriptor> mergedCategorizers;

    /** Contributions in registration order. Accessed only while holding the lock on {@code this}. */
    protected final List<CategorizerDescriptor> registeredCategorizers = new ArrayList<>();

    protected final BlobsExtractor extractor = new BlobsExtractor();

    /** Lazily looked-up conversion service, see {@link #getConversionService()}. */
    protected ConversionService conversionService;

    /**
     * Registers every {@link CategorizerDescriptor} contributed by the given extension, provided it targets
     * the {@value #CATEGORIZERS_XP_NAME} extension point. Other contributions are ignored.
     *
     * @param extension the extension being registered
     */
    @Override
    public void registerExtension(Extension extension) {
        // constant-first comparison: safe even if the extension point name is null
        if (CATEGORIZERS_XP_NAME.equals(extension.getExtensionPoint())) {
            for (Object contrib : extension.getContributions()) {
                if (contrib instanceof CategorizerDescriptor) {
                    registerCategorizerDescriptor((CategorizerDescriptor) contrib, extension);
                }
            }
        }
    }

    /**
     * Unregisters every {@link CategorizerDescriptor} contributed by the given extension, provided it
     * targets the {@value #CATEGORIZERS_XP_NAME} extension point. Other contributions are ignored.
     *
     * @param extension the extension being unregistered
     */
    @Override
    public void unregisterExtension(Extension extension) {
        if (CATEGORIZERS_XP_NAME.equals(extension.getExtensionPoint())) {
            for (Object contrib : extension.getContributions()) {
                if (contrib instanceof CategorizerDescriptor) {
                    unregisterCategorizerDescriptor((CategorizerDescriptor) contrib, extension);
                }
            }
        }
    }

    /**
     * Initializes the descriptor in the context of its contributing extension, records it and invalidates
     * the merged view.
     * <p>
     * Synchronized like {@link #unregisterCategorizerDescriptor} so that the descriptor list is never
     * modified while {@link #getMergedDescriptors()} iterates over it.
     *
     * @param descriptor the contribution to record
     * @param extension the extension that contributed the descriptor
     */
    protected synchronized void registerCategorizerDescriptor(CategorizerDescriptor descriptor,
            Extension extension) {

        descriptor.initializeInContext(extension.getContext());

        // register and invalidate merged Categorizers
        registeredCategorizers.add(descriptor);
        mergedCategorizers = null;
    }

    /**
     * Removes the last registered occurrence of the descriptor and invalidates the merged view; logs a
     * warning when the descriptor is not currently registered.
     *
     * @param descriptor the contribution to remove
     * @param extension the extension that contributed the descriptor (not used by this implementation)
     */
    protected synchronized void unregisterCategorizerDescriptor(CategorizerDescriptor descriptor,
            Extension extension) {

        int index = registeredCategorizers.lastIndexOf(descriptor);
        if (index != -1) {
            registeredCategorizers.remove(index);
            mergedCategorizers = null;
        } else {
            log.warn(String.format("no registered Categorizer under name '%s'", descriptor.getName()));
        }
    }

    /**
     * Returns the registered descriptors merged by name, in registration order. A disabled contribution
     * removes whatever was merged so far under the same name.
     * <p>
     * The map is built in a local variable and stored in {@link #mergedCategorizers} only once it is
     * complete, so callers taking the unsynchronized fast path never see a partially filled map, and the
     * value returned is never {@code null} even if the cache is invalidated concurrently.
     *
     * @return the merged descriptors keyed by categorizer name, never {@code null}
     */
    protected Map<String, CategorizerDescriptor> getMergedDescriptors() {
        Map<String, CategorizerDescriptor> merged = mergedCategorizers;
        if (merged == null) {
            synchronized (this) {
                // re-read under the lock: another thread may have rebuilt the map in the meantime
                merged = mergedCategorizers;
                if (merged == null) {
                    merged = new LinkedHashMap<>();
                    for (CategorizerDescriptor descriptor : registeredCategorizers) {
                        String name = descriptor.getName();
                        if (descriptor.isEnabled()) {
                            CategorizerDescriptor previousDescriptor = merged.get(name);
                            CategorizerDescriptor mergedDescriptor = new CategorizerDescriptor();
                            mergedDescriptor.merge(previousDescriptor);
                            mergedDescriptor.merge(descriptor);
                            merged.put(name, mergedDescriptor);
                        } else {
                            merged.remove(name);
                        }
                    }
                    // publish only the fully populated map
                    mergedCategorizers = merged;
                }
            }
        }
        return merged;
    }

    /**
     * Fetches the referenced documents through the session and runs {@link #updateCategories(List)} on
     * them.
     *
     * @param session the session used to fetch the documents
     * @param docRefs references of the documents to categorize
     * @return the documents for which at least one categorizer was applicable
     */
    public List<DocumentModel> updateCategories(CoreSession session, List<DocumentRef> docRefs) {
        DocumentModelList documents = session.getDocuments(docRefs.toArray(new DocumentRef[docRefs.size()]));
        return updateCategories(documents);
    }

    /**
     * Applies to each document every merged categorizer whose {@code shouldProcess} accepts it. A selected
     * categorizer is only run when the extracted text is strictly longer than its minimum text length.
     *
     * @param documents the documents to categorize
     * @return the documents for which at least one categorizer was selected, without duplicates and in
     *         encounter order; a document is included even when its text was too short for every selected
     *         categorizer
     */
    public List<DocumentModel> updateCategories(List<DocumentModel> documents) {

        Set<DocumentModel> impactedDocs = new LinkedHashSet<>();
        // resolve the merged categorizers once so the whole batch is processed with the same set
        Map<String, CategorizerDescriptor> categorizers = getMergedDescriptors();

        for (DocumentModel doc : documents) {
            List<CategorizerDescriptor> categorizersToApply = new ArrayList<>();
            for (CategorizerDescriptor categorizer : categorizers.values()) {
                if (categorizer.shouldProcess(doc)) {
                    categorizersToApply.add(categorizer);
                }
            }
            if (!categorizersToApply.isEmpty()) {
                // avoid extracting the fulltext content if no categorizer to
                // apply
                String textContent = extractTextContent(doc);
                for (CategorizerDescriptor categorizer : categorizersToApply) {
                    if (textContent.length() > categorizer.getMinTextLength()) {
                        categorizer.processDocument(doc, textContent);
                    }
                }
                impactedDocs.add(doc);
            }
        }
        return new ArrayList<>(impactedDocs);
    }

    /**
     * Builds the text handed to the categorizers: the document title, its {@code dc:description}, and the
     * {@value #ANY2TEXT} conversion of every blob found by the blobs extractor plus the {@code note:note}
     * content when the document has one. Parts are joined with a newline.
     * <p>
     * Blobs that fail to convert are logged and skipped.
     *
     * @param doc the document to extract text from
     * @return the concatenated text content
     */
    public String extractTextContent(DocumentModel doc) {
        List<String> strings = new ArrayList<>();

        // text properties
        String title = doc.getTitle();
        if (title != null) {
            // a null entry would otherwise end up in the joined text
            strings.add(title);
        }
        String description = doc.getProperty("dc:description").getValue(String.class);
        if (description != null) {
            strings.add(description);
        }
        // TODO: extract / factorize / reuse the SQL storage full-text indexing
        // text extraction code

        // work on a copy: the list returned by the extractor is not ours to modify
        List<Blob> blobs = new ArrayList<>(extractor.getBlobs(doc));
        try {
            String noteContent = (String) doc.getPropertyValue("note:note");
            if (noteContent != null) {
                // an empty note has no text to contribute; do not build a blob from null
                blobs.add(Blobs.createBlob(noteContent, "text/html"));
            }
        } catch (PropertyException pe) {
            // not a note, ignore
        }

        // binary properties
        ConversionService converter = getConversionService();
        for (Blob blob : blobs) {
            try {
                BlobHolder result = converter.convert(ANY2TEXT, new SimpleBlobHolder(blob), null);
                if (result == null) {
                    continue;
                }
                Blob textBlob = result.getBlob();
                if (textBlob == null) {
                    continue;
                }
                String string = new String(textBlob.getByteArray(), "UTF-8");
                // replace '\0' chars in the text with spaces
                if (string.indexOf('\0') >= 0) {
                    string = string.replace("\0", " ");
                }
                strings.add(string);
            } catch (ConversionException | IOException e) {
                log.error(e.getMessage(), e);
            }
        }
        return StringUtils.join(strings, "\n");
    }

    /**
     * Returns the conversion service, looking it up through {@link Framework} on first use and caching it
     * in {@link #conversionService}.
     *
     * @return the cached conversion service, or whatever the framework lookup returned
     */
    protected ConversionService getConversionService() {
        if (conversionService == null) {
            conversionService = Framework.getService(ConversionService.class);
        }
        return conversionService;
    }
}