001/* 002 * (C) Copyright 2009 Nuxeo SAS (http://nuxeo.com/) and contributors. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the GNU Lesser General Public License 006 * (LGPL) version 2.1 which accompanies this distribution, and is available at 007 * http://www.gnu.org/licenses/lgpl.html 008 * 009 * This library is distributed in the hope that it will be useful, 010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 012 * Lesser General Public License for more details. 013 * 014 * Contributors: 015 * Olivier Grisel 016 */ 017package org.nuxeo.ecm.platform.categorization.service; 018 019import java.io.IOException; 020import java.io.Serializable; 021import java.util.ArrayList; 022import java.util.HashMap; 023import java.util.List; 024import java.util.Map; 025 026import org.apache.commons.logging.Log; 027import org.apache.commons.logging.LogFactory; 028import org.nuxeo.common.utils.StringUtils; 029import org.nuxeo.common.xmap.annotation.XNode; 030import org.nuxeo.common.xmap.annotation.XNodeList; 031import org.nuxeo.common.xmap.annotation.XNodeMap; 032import org.nuxeo.common.xmap.annotation.XObject; 033import org.nuxeo.ecm.core.api.DocumentModel; 034import org.nuxeo.ecm.core.api.NuxeoException; 035import org.nuxeo.ecm.core.api.PropertyException; 036import org.nuxeo.ecm.core.api.model.Property; 037import org.nuxeo.runtime.model.RuntimeContext; 038 039@XObject("categorizer") 040public class CategorizerDescriptor { 041 042 public static final Log log = LogFactory.getLog(CategorizerDescriptor.class); 043 044 protected static int DEFAULT_MAX_SUGGESTIONS = 3; 045 046 protected static int DEFAULT_MIN_TEXT_LENGTH = 50; 047 048 protected RuntimeContext runtimeContext; 049 050 @XNode("@name") 051 protected String name; 052 053 @XNode("@property") 054 protected String propertyXPath; 055 056 @XNode("@factory") 057 protected String className; 058 059 @XNode("@model") 060 protected String modelFile; 061 062 @XNode("@enabled") 063 protected boolean enabled = true; 064 065 @XNode("@maxSuggestions") 066 protected int maxSuggestions = DEFAULT_MAX_SUGGESTIONS; 067 068 @XNode("@minTextLength") 069 protected int minTextLength = DEFAULT_MIN_TEXT_LENGTH; 070 071 @XNode("@precisionThreshold") 072 protected Double precisionThreshold; 073 074 @XNodeList(value = "skip/facet@name", type = ArrayList.class, componentType = String.class) 075 public List<String> skipFacets = new ArrayList<String>(); 076 077 @XNodeMap(value = "mapping/outcome", key = "@name", type = HashMap.class, componentType = String.class) 078 Map<String, String> mapping = new HashMap<String, String>(); 079 080 protected Categorizer categorizer; 081 082 protected CategorizerFactory factory; 083 084 public String getName() { 085 return name; 086 } 087 088 public void initializeInContext(RuntimeContext context) { 089 if (className != null) { 090 try { 091 factory = (CategorizerFactory) context.loadClass(className).newInstance(); 092 } catch (ReflectiveOperationException e) { 093 throw new RuntimeException(e); 094 } 095 } 096 // if className is null, this descriptor is probably an override 097 } 098 099 public boolean isEnabled() { 100 return enabled; 101 } 102 103 public void processDocument(DocumentModel doc, String textContent) throws PropertyException { 104 if (categorizer == null) { 105 // lazy loading of the model in memory 106 categorizer = factory.loadInstance(modelFile, true); 107 } 108 109 List<String> suggestedCategories = categorizer.guessCategories(textContent, maxSuggestions, precisionThreshold); 110 log.debug(String.format("Sugestions for document '%s' and property '%s'" 111 + " with textcontent of length %d: [%s]", doc.getTitle(), propertyXPath, textContent.length(), 112 StringUtils.join(suggestedCategories, ", "))); 113 114 List<String> propertyValues = new ArrayList<String>(maxSuggestions); 115 if (!mapping.isEmpty()) { 116 for (String suggestion : suggestedCategories) { 117 String property = mapping.get(suggestion); 118 if (property != null) { 119 propertyValues.add(property); 120 } 121 } 122 } else { 123 propertyValues.addAll(suggestedCategories); 124 } 125 126 if (propertyValues.isEmpty()) { 127 return; 128 } else if (propertyValues.size() > maxSuggestions) { 129 propertyValues = propertyValues.subList(0, maxSuggestions); 130 } 131 Property property = doc.getProperty(propertyXPath); 132 if (property.isList()) { 133 doc.setPropertyValue(propertyXPath, (Serializable) propertyValues); 134 } else { 135 doc.setPropertyValue(propertyXPath, propertyValues.get(0)); 136 } 137 } 138 139 @SuppressWarnings("unchecked") 140 public boolean shouldProcess(DocumentModel doc) { 141 if (skipFacets != null) { 142 for (String facetToSkip : skipFacets) { 143 if (doc.hasFacet(facetToSkip)) { 144 return false; 145 } 146 } 147 } 148 // TODO make it possible to delegate the work to the categorizer impl 149 try { 150 Property property = doc.getProperty(propertyXPath); 151 if (property.getValue() == null) { 152 return true; 153 } 154 if (property.isList()) { 155 List<String> values = property.getValue(List.class); 156 if (values.isEmpty()) { 157 return true; 158 } 159 } else if (property.isComplex()) { 160 // TODO: use a dedicated exception class instead 161 throw new NuxeoException(propertyXPath 162 + " is a complex type field and hence is not suitable for text based categorization"); 163 } else if (property.getValue().toString().trim().length() == 0) { 164 return true; 165 } 166 } catch (PropertyException e) { 167 // document has not such property 168 return false; 169 } 170 // do not categorize document that already have a non-empty target 171 // property 172 return false; 173 } 174 175 public int getMinTextLength() { 176 return minTextLength; 177 } 178 179 public void setMinTextLength(int minTextLength) { 180 this.minTextLength = minTextLength; 181 } 182 183 /** 184 * Chainable update the parameters of the current descriptor with the non-null parameters of the other descriptor. 185 */ 186 public CategorizerDescriptor merge(CategorizerDescriptor other) { 187 if (other != null) { 188 if (other.propertyXPath != null) { 189 propertyXPath = other.propertyXPath; 190 } 191 if (other.className != null) { 192 className = other.className; 193 } 194 if (other.categorizer != null) { 195 categorizer = other.categorizer; 196 } 197 if (other.factory != null) { 198 factory = other.factory; 199 } 200 if (other.maxSuggestions != DEFAULT_MAX_SUGGESTIONS) { 201 maxSuggestions = other.maxSuggestions; 202 } 203 if (other.minTextLength != DEFAULT_MIN_TEXT_LENGTH) { 204 minTextLength = other.minTextLength; 205 } 206 if (other.precisionThreshold != null) { 207 precisionThreshold = other.precisionThreshold; 208 } 209 if (other.modelFile != null) { 210 modelFile = other.modelFile; 211 } 212 if (!other.mapping.isEmpty()) { 213 mapping = other.mapping; 214 } 215 if (!other.skipFacets.isEmpty()) { 216 skipFacets = other.skipFacets; 217 } 218 } 219 return this; 220 } 221}