/*
 * (C) Copyright 2009 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Olivier Grisel
 */
package org.nuxeo.ecm.platform.categorization.service;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.common.utils.StringUtils;
import org.nuxeo.common.xmap.annotation.XNode;
import org.nuxeo.common.xmap.annotation.XNodeList;
import org.nuxeo.common.xmap.annotation.XNodeMap;
import org.nuxeo.common.xmap.annotation.XObject;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.NuxeoException;
import org.nuxeo.ecm.core.api.PropertyException;
import org.nuxeo.ecm.core.api.model.Property;
import org.nuxeo.runtime.model.RuntimeContext;

/**
 * XMap descriptor bound to the {@code categorizer} element.
 * <p>
 * It carries the configuration read from the element's attributes and children (target property, factory class, model
 * file, limits, facets to skip, outcome mapping) and uses a {@link CategorizerFactory} to obtain the
 * {@link Categorizer} that produces suggestions for a document's text content.
 */
@XObject("categorizer")
public class CategorizerDescriptor {

    public static final Log log = LogFactory.getLog(CategorizerDescriptor.class);

    /** Value of {@link #maxSuggestions} when the {@code maxSuggestions} attribute is absent. */
    protected static int DEFAULT_MAX_SUGGESTIONS = 3;

    /** Value of {@link #minTextLength} when the {@code minTextLength} attribute is absent. */
    protected static int DEFAULT_MIN_TEXT_LENGTH = 50;

    // NOTE(review): never assigned anywhere in this class; confirm whether subclasses or other code rely on it
    protected RuntimeContext runtimeContext;

    @XNode("@name")
    protected String name;

    /** XPath of the document property that receives the suggested categories. */
    @XNode("@property")
    protected String propertyXPath;

    /** Name of the {@link CategorizerFactory} implementation class. */
    @XNode("@factory")
    protected String className;

    /** Model reference handed to the factory when the categorizer is loaded. */
    @XNode("@model")
    protected String modelFile;

    @XNode("@enabled")
    protected boolean enabled = true;

    @XNode("@maxSuggestions")
    protected int maxSuggestions = DEFAULT_MAX_SUGGESTIONS;

    @XNode("@minTextLength")
    protected int minTextLength = DEFAULT_MIN_TEXT_LENGTH;

    /** Optional threshold forwarded as-is to the categorizer; {@code null} when not configured. */
    @XNodeClassPlaceholder
    protected Double precisionThreshold;

    /** Facets for which {@link #shouldProcess(DocumentModel)} returns {@code false}. */
    @XNodeList(value = "skip/facet@name", type = ArrayList.class, componentType = String.class)
    public List<String> skipFacets = new ArrayList<String>();

    /** Optional translation from a categorizer outcome to the value stored in the target property. */
    @XNodeMap(value = "mapping/outcome", key = "@name", type = HashMap.class, componentType = String.class)
    Map<String, String> mapping = new HashMap<String, String>();

    /** Lazily created on the first call to {@link #processDocument(DocumentModel, String)}. */
    protected Categorizer categorizer;

    protected CategorizerFactory factory;

    /**
     * @return the value of the {@code name} attribute
     */
    public String getName() {
        return name;
    }

    /**
     * Instantiates the configured {@link CategorizerFactory} using the given runtime context.
     * <p>
     * Does nothing when no factory class name is configured.
     *
     * @param context the context used to load the factory class
     * @throws RuntimeException if the factory class cannot be loaded or instantiated
     */
    public void initializeInContext(RuntimeContext context) {
        if (className != null) {
            try {
                factory = (CategorizerFactory) context.loadClass(className).newInstance();
            } catch (ReflectiveOperationException e) {
                throw new RuntimeException(
                        "Cannot instantiate categorizer factory '" + className + "' for categorizer '" + name + "'", e);
            }
        }
        // if className is null, this descriptor is probably an override
    }

    /**
     * @return the value of the {@code enabled} attribute, {@code true} by default
     */
    public boolean isEnabled() {
        return enabled;
    }

    /**
     * Asks the categorizer for suggestions on {@code textContent} and stores them in the target property of
     * {@code doc}.
     * <p>
     * When an outcome mapping is configured, only suggestions that have a mapping entry are kept, translated to their
     * mapped value. At most {@link #maxSuggestions} values are written. A list property receives all retained values,
     * any other property receives the first one. Nothing is written when no value is retained.
     *
     * @param doc the document to update
     * @param textContent the text submitted to the categorizer
     * @throws PropertyException if the target property cannot be read or written
     * @throws IllegalStateException if no categorizer is loaded yet and no factory is available to load one
     */
    public void processDocument(DocumentModel doc, String textContent) throws PropertyException {
        if (categorizer == null) {
            if (factory == null) {
                // fail with an explicit message instead of a bare NullPointerException
                throw new IllegalStateException("Categorizer '" + name
                        + "' has no factory: initializeInContext was not called or no factory class is configured");
            }
            // lazy loading of the model in memory
            categorizer = factory.loadInstance(modelFile, true);
        }

        List<String> suggestedCategories = categorizer.guessCategories(textContent, maxSuggestions, precisionThreshold);
        if (log.isDebugEnabled()) {
            // only build the message (title lookup, join) when it will actually be logged
            int textLength = textContent == null ? 0 : textContent.length();
            log.debug(String.format("Sugestions for document '%s' and property '%s'"
                    + " with textcontent of length %d: [%s]", doc.getTitle(), propertyXPath, textLength,
                    StringUtils.join(suggestedCategories, ", ")));
        }

        List<String> propertyValues = new ArrayList<String>(maxSuggestions);
        if (!mapping.isEmpty()) {
            for (String suggestion : suggestedCategories) {
                String property = mapping.get(suggestion);
                if (property != null) {
                    propertyValues.add(property);
                }
            }
        } else {
            propertyValues.addAll(suggestedCategories);
        }

        if (propertyValues.isEmpty()) {
            return;
        } else if (propertyValues.size() > maxSuggestions) {
            // subList() returns a view that is not Serializable; copy it so that the cast below cannot fail
            propertyValues = new ArrayList<String>(propertyValues.subList(0, maxSuggestions));
        }
        Property property = doc.getProperty(propertyXPath);
        if (property.isList()) {
            doc.setPropertyValue(propertyXPath, (Serializable) propertyValues);
        } else {
            doc.setPropertyValue(propertyXPath, propertyValues.get(0));
        }
    }

    /**
     * Tells whether the target property of {@code doc} is still empty and may therefore be filled by this categorizer.
     *
     * @param doc the candidate document
     * @return {@code true} if the target property value is {@code null}, an empty list or a blank string;
     *         {@code false} if the document has one of the facets to skip, if reading the property raises a
     *         {@link PropertyException}, or if the property already holds a non-empty value
     * @throws NuxeoException if the target property has a non-null value and is complex (and is not a list)
     */
    public boolean shouldProcess(DocumentModel doc) {
        if (skipFacets != null) {
            for (String facetToSkip : skipFacets) {
                if (doc.hasFacet(facetToSkip)) {
                    return false;
                }
            }
        }
        // TODO make it possible to delegate the work to the categorizer impl
        try {
            Property property = doc.getProperty(propertyXPath);
            if (property.getValue() == null) {
                return true;
            }
            if (property.isList()) {
                @SuppressWarnings("unchecked")
                List<String> values = property.getValue(List.class);
                if (values.isEmpty()) {
                    return true;
                }
            } else if (property.isComplex()) {
                // TODO: use a dedicated exception class instead
                throw new NuxeoException(propertyXPath
                        + " is a complex type field and hence is not suitable for text based categorization");
            } else if (property.getValue().toString().trim().length() == 0) {
                return true;
            }
        } catch (PropertyException e) {
            // document has not such property
            return false;
        }
        // do not categorize document that already have a non-empty target
        // property
        return false;
    }

    /**
     * @return the value of the {@code minTextLength} attribute, {@link #DEFAULT_MIN_TEXT_LENGTH} by default
     */
    public int getMinTextLength() {
        return minTextLength;
    }

    /**
     * @param minTextLength the new value returned by {@link #getMinTextLength()}
     */
    public void setMinTextLength(int minTextLength) {
        this.minTextLength = minTextLength;
    }

    /**
     * Chainable update the parameters of the current descriptor with the non-null parameters of the other descriptor.
     * <p>
     * {@code maxSuggestions} and {@code minTextLength} are only taken from {@code other} when they differ from their
     * default values. The mapping and the facets to skip are only taken when non-empty, and are then shared by
     * reference with {@code other} rather than copied. {@code name} and {@code enabled} are never changed by this
     * method.
     *
     * @param other the descriptor holding the overriding values, may be {@code null}
     * @return this descriptor
     */
    public CategorizerDescriptor merge(CategorizerDescriptor other) {
        if (other != null) {
            if (other.propertyXPath != null) {
                propertyXPath = other.propertyXPath;
            }
            if (other.className != null) {
                className = other.className;
            }
            if (other.categorizer != null) {
                categorizer = other.categorizer;
            }
            if (other.factory != null) {
                factory = other.factory;
            }
            if (other.maxSuggestions != DEFAULT_MAX_SUGGESTIONS) {
                maxSuggestions = other.maxSuggestions;
            }
            if (other.minTextLength != DEFAULT_MIN_TEXT_LENGTH) {
                minTextLength = other.minTextLength;
            }
            if (other.precisionThreshold != null) {
                precisionThreshold = other.precisionThreshold;
            }
            if (other.modelFile != null) {
                modelFile = other.modelFile;
            }
            if (!other.mapping.isEmpty()) {
                mapping = other.mapping;
            }
            if (!other.skipFacets.isEmpty()) {
                skipFacets = other.skipFacets;
            }
        }
        return this;
    }
}