001/*
002 * (C) Copyright 2009 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Olivier Grisel
018 */
019package org.nuxeo.ecm.platform.categorization.service;
020
021import java.io.IOException;
022import java.io.Serializable;
023import java.util.ArrayList;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027
028import org.apache.commons.logging.Log;
029import org.apache.commons.logging.LogFactory;
030import org.nuxeo.common.utils.StringUtils;
031import org.nuxeo.common.xmap.annotation.XNode;
032import org.nuxeo.common.xmap.annotation.XNodeList;
033import org.nuxeo.common.xmap.annotation.XNodeMap;
034import org.nuxeo.common.xmap.annotation.XObject;
035import org.nuxeo.ecm.core.api.DocumentModel;
036import org.nuxeo.ecm.core.api.NuxeoException;
037import org.nuxeo.ecm.core.api.PropertyException;
038import org.nuxeo.ecm.core.api.model.Property;
039import org.nuxeo.runtime.model.RuntimeContext;
040
041@XObject("categorizer")
042public class CategorizerDescriptor {
043
044    public static final Log log = LogFactory.getLog(CategorizerDescriptor.class);
045
046    protected static int DEFAULT_MAX_SUGGESTIONS = 3;
047
048    protected static int DEFAULT_MIN_TEXT_LENGTH = 50;
049
050    protected RuntimeContext runtimeContext;
051
052    @XNode("@name")
053    protected String name;
054
055    @XNode("@property")
056    protected String propertyXPath;
057
058    @XNode("@factory")
059    protected String className;
060
061    @XNode("@model")
062    protected String modelFile;
063
064    @XNode("@enabled")
065    protected boolean enabled = true;
066
067    @XNode("@maxSuggestions")
068    protected int maxSuggestions = DEFAULT_MAX_SUGGESTIONS;
069
070    @XNode("@minTextLength")
071    protected int minTextLength = DEFAULT_MIN_TEXT_LENGTH;
072
073    @XNode("@precisionThreshold")
074    protected Double precisionThreshold;
075
076    @XNodeList(value = "skip/facet@name", type = ArrayList.class, componentType = String.class)
077    public List<String> skipFacets = new ArrayList<String>();
078
079    @XNodeMap(value = "mapping/outcome", key = "@name", type = HashMap.class, componentType = String.class)
080    Map<String, String> mapping = new HashMap<String, String>();
081
082    protected Categorizer categorizer;
083
084    protected CategorizerFactory factory;
085
086    public String getName() {
087        return name;
088    }
089
090    public void initializeInContext(RuntimeContext context) {
091        if (className != null) {
092            try {
093                factory = (CategorizerFactory) context.loadClass(className).newInstance();
094            } catch (ReflectiveOperationException e) {
095                throw new RuntimeException(e);
096            }
097        }
098        // if className is null, this descriptor is probably an override
099    }
100
101    public boolean isEnabled() {
102        return enabled;
103    }
104
105    public void processDocument(DocumentModel doc, String textContent) throws PropertyException {
106        if (categorizer == null) {
107            // lazy loading of the model in memory
108            categorizer = factory.loadInstance(modelFile, true);
109        }
110
111        List<String> suggestedCategories = categorizer.guessCategories(textContent, maxSuggestions, precisionThreshold);
112        log.debug(String.format("Sugestions for document '%s' and property '%s'"
113                + " with textcontent of length %d: [%s]", doc.getTitle(), propertyXPath, textContent.length(),
114                StringUtils.join(suggestedCategories, ", ")));
115
116        List<String> propertyValues = new ArrayList<String>(maxSuggestions);
117        if (!mapping.isEmpty()) {
118            for (String suggestion : suggestedCategories) {
119                String property = mapping.get(suggestion);
120                if (property != null) {
121                    propertyValues.add(property);
122                }
123            }
124        } else {
125            propertyValues.addAll(suggestedCategories);
126        }
127
128        if (propertyValues.isEmpty()) {
129            return;
130        } else if (propertyValues.size() > maxSuggestions) {
131            propertyValues = propertyValues.subList(0, maxSuggestions);
132        }
133        Property property = doc.getProperty(propertyXPath);
134        if (property.isList()) {
135            doc.setPropertyValue(propertyXPath, (Serializable) propertyValues);
136        } else {
137            doc.setPropertyValue(propertyXPath, propertyValues.get(0));
138        }
139    }
140
141    @SuppressWarnings("unchecked")
142    public boolean shouldProcess(DocumentModel doc) {
143        if (skipFacets != null) {
144            for (String facetToSkip : skipFacets) {
145                if (doc.hasFacet(facetToSkip)) {
146                    return false;
147                }
148            }
149        }
150        // TODO make it possible to delegate the work to the categorizer impl
151        try {
152            Property property = doc.getProperty(propertyXPath);
153            if (property.getValue() == null) {
154                return true;
155            }
156            if (property.isList()) {
157                List<String> values = property.getValue(List.class);
158                if (values.isEmpty()) {
159                    return true;
160                }
161            } else if (property.isComplex()) {
162                // TODO: use a dedicated exception class instead
163                throw new NuxeoException(propertyXPath
164                        + " is a complex type field and hence is not suitable for text based categorization");
165            } else if (property.getValue().toString().trim().length() == 0) {
166                return true;
167            }
168        } catch (PropertyException e) {
169            // document has not such property
170            return false;
171        }
172        // do not categorize document that already have a non-empty target
173        // property
174        return false;
175    }
176
177    public int getMinTextLength() {
178        return minTextLength;
179    }
180
181    public void setMinTextLength(int minTextLength) {
182        this.minTextLength = minTextLength;
183    }
184
185    /**
186     * Chainable update the parameters of the current descriptor with the non-null parameters of the other descriptor.
187     */
188    public CategorizerDescriptor merge(CategorizerDescriptor other) {
189        if (other != null) {
190            if (other.propertyXPath != null) {
191                propertyXPath = other.propertyXPath;
192            }
193            if (other.className != null) {
194                className = other.className;
195            }
196            if (other.categorizer != null) {
197                categorizer = other.categorizer;
198            }
199            if (other.factory != null) {
200                factory = other.factory;
201            }
202            if (other.maxSuggestions != DEFAULT_MAX_SUGGESTIONS) {
203                maxSuggestions = other.maxSuggestions;
204            }
205            if (other.minTextLength != DEFAULT_MIN_TEXT_LENGTH) {
206                minTextLength = other.minTextLength;
207            }
208            if (other.precisionThreshold != null) {
209                precisionThreshold = other.precisionThreshold;
210            }
211            if (other.modelFile != null) {
212                modelFile = other.modelFile;
213            }
214            if (!other.mapping.isEmpty()) {
215                mapping = other.mapping;
216            }
217            if (!other.skipFacets.isEmpty()) {
218                skipFacets = other.skipFacets;
219            }
220        }
221        return this;
222    }
223}