/*
 * (C) Copyright 2009 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Olivier Grisel
 */
package org.nuxeo.ecm.platform.categorization.service;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.common.utils.StringUtils;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.DocumentModelList;
import org.nuxeo.ecm.core.api.DocumentRef;
import org.nuxeo.ecm.core.api.PropertyException;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.runtime.api.Framework;
import org.nuxeo.runtime.model.DefaultComponent;
import org.nuxeo.runtime.model.Extension;

/**
 * Runtime component that applies registered {@link CategorizerDescriptor}
 * contributions to documents: for each document it extracts the fulltext
 * content (text properties plus converted binary blobs) and feeds it to every
 * enabled categorizer whose filter accepts the document.
 */
public class DocumentCategorizationServiceImpl extends DefaultComponent
        implements DocumentCategorizationService {

    public static final String CATEGORIZERS_XP_NAME = "categorizers";

    /** Name of the converter chain used to turn any blob into plain text. */
    public static final String ANY2TEXT = "any2text";

    private static final Log log = LogFactory.getLog(DocumentCategorizationServiceImpl.class);

    // Lazily-built merge of all enabled contributions, keyed by categorizer
    // name. volatile is required for the double-checked locking in
    // getMergedDescriptors(): without it a reader outside the lock could see
    // a partially-initialized map.
    protected volatile Map<String, CategorizerDescriptor> mergedCategorizers;

    // Raw contributions in registration order; duplicates by name are merged
    // lazily in getMergedDescriptors(). Guarded by 'this'.
    protected final List<CategorizerDescriptor> registeredCategorizers = new ArrayList<CategorizerDescriptor>();

    protected final BlobsExtractor extractor = new BlobsExtractor();

    // Lazily resolved; Framework.getService is idempotent so the benign race
    // in getConversionService() is harmless.
    protected ConversionService conversionService;

    @Override
    public void registerExtension(Extension extension) {
        if (extension.getExtensionPoint().equals(CATEGORIZERS_XP_NAME)) {
            Object[] contribs = extension.getContributions();
            for (Object contrib : contribs) {
                if (contrib instanceof CategorizerDescriptor) {
                    registerCategorizerDescriptor((CategorizerDescriptor) contrib, extension);
                }
            }
        }
    }

    @Override
    public void unregisterExtension(Extension extension) {
        if (extension.getExtensionPoint().equals(CATEGORIZERS_XP_NAME)) {
            Object[] contribs = extension.getContributions();
            for (Object contrib : contribs) {
                if (contrib instanceof CategorizerDescriptor) {
                    unregisterCategorizerDescriptor((CategorizerDescriptor) contrib, extension);
                }
            }
        }
    }

    /**
     * Registers a categorizer contribution and invalidates the merged view.
     * Synchronized for consistency with
     * {@link #unregisterCategorizerDescriptor} since both mutate the shared
     * registration list.
     */
    protected synchronized void registerCategorizerDescriptor(CategorizerDescriptor descriptor, Extension extension) {

        descriptor.initializeInContext(extension.getContext());

        // register and invalidate merged Categorizers
        registeredCategorizers.add(descriptor);
        mergedCategorizers = null;
    }

    /**
     * Removes the most recently registered matching contribution and
     * invalidates the merged view; logs a warning when nothing matches.
     */
    protected synchronized void unregisterCategorizerDescriptor(CategorizerDescriptor descriptor, Extension extension) {

        int index = registeredCategorizers.lastIndexOf(descriptor);
        if (index != -1) {
            registeredCategorizers.remove(index);
            mergedCategorizers = null;
        } else {
            log.warn(String.format("no registered Categorizer under name '%s'", descriptor.getName()));
        }
    }

    /**
     * Returns the merged, enabled categorizers keyed by name, building the
     * map lazily. Later contributions with the same name are merged over
     * earlier ones; a disabled contribution removes the name entirely.
     * Thread-safe via double-checked locking on the volatile field.
     */
    protected Map<String, CategorizerDescriptor> getMergedDescriptors() {
        Map<String, CategorizerDescriptor> merged = mergedCategorizers;
        if (merged == null) {
            synchronized (this) {
                merged = mergedCategorizers;
                if (merged == null) {
                    merged = new LinkedHashMap<String, CategorizerDescriptor>();
                    for (CategorizerDescriptor descriptor : registeredCategorizers) {
                        String name = descriptor.getName();
                        if (descriptor.isEnabled()) {
                            CategorizerDescriptor previousDescriptor = merged.get(name);
                            CategorizerDescriptor mergedDescriptor = new CategorizerDescriptor();
                            mergedDescriptor.merge(previousDescriptor);
                            mergedDescriptor.merge(descriptor);
                            merged.put(name, mergedDescriptor);
                        } else {
                            merged.remove(name);
                        }
                    }
                    // publish only the fully-built map
                    mergedCategorizers = merged;
                }
            }
        }
        return merged;
    }

    public List<DocumentModel> updateCategories(CoreSession session, List<DocumentRef> docRefs) {
        DocumentModelList documents = session.getDocuments(docRefs.toArray(new DocumentRef[docRefs.size()]));
        return updateCategories(documents);
    }

    /**
     * Runs every matching categorizer on each document and returns the list
     * of documents that were actually processed (insertion order, no
     * duplicates). Documents whose extracted text is shorter than a
     * categorizer's minimum length are skipped by that categorizer but still
     * reported as impacted.
     */
    public List<DocumentModel> updateCategories(List<DocumentModel> documents) {

        Set<DocumentModel> impactedDocs = new LinkedHashSet<DocumentModel>();

        for (DocumentModel doc : documents) {
            List<CategorizerDescriptor> categorizersToApply = new LinkedList<CategorizerDescriptor>();
            for (CategorizerDescriptor categorizer : getMergedDescriptors().values()) {
                if (categorizer.shouldProcess(doc)) {
                    categorizersToApply.add(categorizer);
                }
            }
            if (!categorizersToApply.isEmpty()) {
                // avoid extracting the fulltext content if no categorizer to
                // apply
                String textContent = extractTextContent(doc);
                for (CategorizerDescriptor categorizer : categorizersToApply) {
                    if (textContent.length() > categorizer.getMinTextLength()) {
                        categorizer.processDocument(doc, textContent);
                    }
                }
                impactedDocs.add(doc);
            }
        }
        return new ArrayList<DocumentModel>(impactedDocs);
    }

    /**
     * Extracts a newline-joined plain-text representation of the document:
     * title, dc:description, the note:note HTML content (when present) and
     * every binary blob converted through the {@link #ANY2TEXT} chain.
     * Conversion failures are logged and the corresponding blob skipped.
     */
    public String extractTextContent(DocumentModel doc) {
        List<String> strings = new LinkedList<String>();

        // text properties; guard against null title so the literal string
        // "null" does not pollute the categorization input
        String title = doc.getTitle();
        if (title != null) {
            strings.add(title);
        }
        String description = doc.getProperty("dc:description").getValue(String.class);
        if (description != null) {
            strings.add(description);
        }
        // TODO: extract / factorize / reuse the SQL storage full-text indexing
        // text extraction code

        List<Blob> blobs = extractor.getBlobs(doc);
        try {
            String noteContent = (String) doc.getPropertyValue("note:note");
            if (noteContent != null) {
                blobs.add(Blobs.createBlob(noteContent, "text/html"));
            }
        } catch (PropertyException pe) {
            // not a note, ignore
        }

        // binary properties
        ConversionService service = getConversionService();
        for (Blob blob : blobs) {
            try {
                SimpleBlobHolder bh = new SimpleBlobHolder(blob);
                BlobHolder result = service.convert(ANY2TEXT, bh, null);
                if (result == null) {
                    continue;
                }
                Blob textBlob = result.getBlob();
                if (textBlob == null) {
                    continue;
                }
                String string = new String(textBlob.getByteArray(), StandardCharsets.UTF_8);
                // strip '\0 chars from text
                if (string.indexOf('\0') >= 0) {
                    string = string.replace("\0", " ");
                }
                strings.add(string);
            } catch (ConversionException | IOException e) {
                // best-effort extraction: log and move on to the next blob
                log.error(e.getMessage(), e);
            }
        }
        return StringUtils.join(strings, "\n");
    }

    /** Lazily resolves the conversion service from the runtime framework. */
    protected ConversionService getConversionService() {
        if (conversionService == null) {
            conversionService = Framework.getService(ConversionService.class);
        }
        return conversionService;
    }
}