001/*
002 * (C) Copyright 2009 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Olivier Grisel
016 */
017package org.nuxeo.ecm.platform.categorization.service;
018
019import java.io.IOException;
020import java.io.Serializable;
021import java.util.ArrayList;
022import java.util.HashMap;
023import java.util.List;
024import java.util.Map;
025
026import org.apache.commons.logging.Log;
027import org.apache.commons.logging.LogFactory;
028import org.nuxeo.common.utils.StringUtils;
029import org.nuxeo.common.xmap.annotation.XNode;
030import org.nuxeo.common.xmap.annotation.XNodeList;
031import org.nuxeo.common.xmap.annotation.XNodeMap;
032import org.nuxeo.common.xmap.annotation.XObject;
033import org.nuxeo.ecm.core.api.DocumentModel;
034import org.nuxeo.ecm.core.api.NuxeoException;
035import org.nuxeo.ecm.core.api.PropertyException;
036import org.nuxeo.ecm.core.api.model.Property;
037import org.nuxeo.runtime.model.RuntimeContext;
038
039@XObject("categorizer")
040public class CategorizerDescriptor {
041
042    public static final Log log = LogFactory.getLog(CategorizerDescriptor.class);
043
044    protected static int DEFAULT_MAX_SUGGESTIONS = 3;
045
046    protected static int DEFAULT_MIN_TEXT_LENGTH = 50;
047
048    protected RuntimeContext runtimeContext;
049
050    @XNode("@name")
051    protected String name;
052
053    @XNode("@property")
054    protected String propertyXPath;
055
056    @XNode("@factory")
057    protected String className;
058
059    @XNode("@model")
060    protected String modelFile;
061
062    @XNode("@enabled")
063    protected boolean enabled = true;
064
065    @XNode("@maxSuggestions")
066    protected int maxSuggestions = DEFAULT_MAX_SUGGESTIONS;
067
068    @XNode("@minTextLength")
069    protected int minTextLength = DEFAULT_MIN_TEXT_LENGTH;
070
071    @XNode("@precisionThreshold")
072    protected Double precisionThreshold;
073
074    @XNodeList(value = "skip/facet@name", type = ArrayList.class, componentType = String.class)
075    public List<String> skipFacets = new ArrayList<String>();
076
077    @XNodeMap(value = "mapping/outcome", key = "@name", type = HashMap.class, componentType = String.class)
078    Map<String, String> mapping = new HashMap<String, String>();
079
080    protected Categorizer categorizer;
081
082    protected CategorizerFactory factory;
083
084    public String getName() {
085        return name;
086    }
087
088    public void initializeInContext(RuntimeContext context) {
089        if (className != null) {
090            try {
091                factory = (CategorizerFactory) context.loadClass(className).newInstance();
092            } catch (ReflectiveOperationException e) {
093                throw new RuntimeException(e);
094            }
095        }
096        // if className is null, this descriptor is probably an override
097    }
098
099    public boolean isEnabled() {
100        return enabled;
101    }
102
103    public void processDocument(DocumentModel doc, String textContent) throws PropertyException {
104        if (categorizer == null) {
105            // lazy loading of the model in memory
106            categorizer = factory.loadInstance(modelFile, true);
107        }
108
109        List<String> suggestedCategories = categorizer.guessCategories(textContent, maxSuggestions, precisionThreshold);
110        log.debug(String.format("Sugestions for document '%s' and property '%s'"
111                + " with textcontent of length %d: [%s]", doc.getTitle(), propertyXPath, textContent.length(),
112                StringUtils.join(suggestedCategories, ", ")));
113
114        List<String> propertyValues = new ArrayList<String>(maxSuggestions);
115        if (!mapping.isEmpty()) {
116            for (String suggestion : suggestedCategories) {
117                String property = mapping.get(suggestion);
118                if (property != null) {
119                    propertyValues.add(property);
120                }
121            }
122        } else {
123            propertyValues.addAll(suggestedCategories);
124        }
125
126        if (propertyValues.isEmpty()) {
127            return;
128        } else if (propertyValues.size() > maxSuggestions) {
129            propertyValues = propertyValues.subList(0, maxSuggestions);
130        }
131        Property property = doc.getProperty(propertyXPath);
132        if (property.isList()) {
133            doc.setPropertyValue(propertyXPath, (Serializable) propertyValues);
134        } else {
135            doc.setPropertyValue(propertyXPath, propertyValues.get(0));
136        }
137    }
138
139    @SuppressWarnings("unchecked")
140    public boolean shouldProcess(DocumentModel doc) {
141        if (skipFacets != null) {
142            for (String facetToSkip : skipFacets) {
143                if (doc.hasFacet(facetToSkip)) {
144                    return false;
145                }
146            }
147        }
148        // TODO make it possible to delegate the work to the categorizer impl
149        try {
150            Property property = doc.getProperty(propertyXPath);
151            if (property.getValue() == null) {
152                return true;
153            }
154            if (property.isList()) {
155                List<String> values = property.getValue(List.class);
156                if (values.isEmpty()) {
157                    return true;
158                }
159            } else if (property.isComplex()) {
160                // TODO: use a dedicated exception class instead
161                throw new NuxeoException(propertyXPath
162                        + " is a complex type field and hence is not suitable for text based categorization");
163            } else if (property.getValue().toString().trim().length() == 0) {
164                return true;
165            }
166        } catch (PropertyException e) {
167            // document has not such property
168            return false;
169        }
170        // do not categorize document that already have a non-empty target
171        // property
172        return false;
173    }
174
175    public int getMinTextLength() {
176        return minTextLength;
177    }
178
179    public void setMinTextLength(int minTextLength) {
180        this.minTextLength = minTextLength;
181    }
182
183    /**
184     * Chainable update the parameters of the current descriptor with the non-null parameters of the other descriptor.
185     */
186    public CategorizerDescriptor merge(CategorizerDescriptor other) {
187        if (other != null) {
188            if (other.propertyXPath != null) {
189                propertyXPath = other.propertyXPath;
190            }
191            if (other.className != null) {
192                className = other.className;
193            }
194            if (other.categorizer != null) {
195                categorizer = other.categorizer;
196            }
197            if (other.factory != null) {
198                factory = other.factory;
199            }
200            if (other.maxSuggestions != DEFAULT_MAX_SUGGESTIONS) {
201                maxSuggestions = other.maxSuggestions;
202            }
203            if (other.minTextLength != DEFAULT_MIN_TEXT_LENGTH) {
204                minTextLength = other.minTextLength;
205            }
206            if (other.precisionThreshold != null) {
207                precisionThreshold = other.precisionThreshold;
208            }
209            if (other.modelFile != null) {
210                modelFile = other.modelFile;
211            }
212            if (!other.mapping.isEmpty()) {
213                mapping = other.mapping;
214            }
215            if (!other.skipFacets.isEmpty()) {
216                skipFacets = other.skipFacets;
217            }
218        }
219        return this;
220    }
221}