001/*
002 * (C) Copyright 2009 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Olivier Grisel
016 */
017package org.nuxeo.ecm.platform.categorization.service;
018
019import java.io.IOException;
020import java.util.ArrayList;
021import java.util.LinkedHashMap;
022import java.util.LinkedHashSet;
023import java.util.LinkedList;
024import java.util.List;
025import java.util.Map;
026import java.util.Set;
027
028import org.apache.commons.logging.Log;
029import org.apache.commons.logging.LogFactory;
030import org.nuxeo.common.utils.StringUtils;
031import org.nuxeo.ecm.core.api.Blob;
032import org.nuxeo.ecm.core.api.Blobs;
033import org.nuxeo.ecm.core.api.CoreSession;
034import org.nuxeo.ecm.core.api.DocumentModel;
035import org.nuxeo.ecm.core.api.DocumentModelList;
036import org.nuxeo.ecm.core.api.DocumentRef;
037import org.nuxeo.ecm.core.api.PropertyException;
038import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
039import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
040import org.nuxeo.ecm.core.convert.api.ConversionException;
041import org.nuxeo.ecm.core.convert.api.ConversionService;
042import org.nuxeo.ecm.core.utils.BlobsExtractor;
043import org.nuxeo.runtime.api.Framework;
044import org.nuxeo.runtime.model.DefaultComponent;
045import org.nuxeo.runtime.model.Extension;
046
047public class DocumentCategorizationServiceImpl extends DefaultComponent implements DocumentCategorizationService {
048
049    public static final String CATEGORIZERS_XP_NAME = "categorizers";
050
051    public static final String ANY2TEXT = "any2text";
052
053    private static final Log log = LogFactory.getLog(DocumentCategorizationServiceImpl.class);
054
055    protected Map<String, CategorizerDescriptor> mergedCategorizers;
056
057    protected final List<CategorizerDescriptor> registeredCategorizers = new ArrayList<CategorizerDescriptor>();
058
059    protected final BlobsExtractor extractor = new BlobsExtractor();
060
061    protected ConversionService conversionService;
062
063    @Override
064    public void registerExtension(Extension extension) {
065        if (extension.getExtensionPoint().equals(CATEGORIZERS_XP_NAME)) {
066            Object[] contribs = extension.getContributions();
067            for (Object contrib : contribs) {
068                if (contrib instanceof CategorizerDescriptor) {
069                    registerCategorizerDescriptor((CategorizerDescriptor) contrib, extension);
070                }
071            }
072        }
073    }
074
075    @Override
076    public void unregisterExtension(Extension extension) {
077        if (extension.getExtensionPoint().equals(CATEGORIZERS_XP_NAME)) {
078            Object[] contribs = extension.getContributions();
079            for (Object contrib : contribs) {
080                if (contrib instanceof CategorizerDescriptor) {
081                    unregisterCategorizerDescriptor((CategorizerDescriptor) contrib, extension);
082                }
083            }
084        }
085    }
086
087    protected void registerCategorizerDescriptor(CategorizerDescriptor descriptor, Extension extension) {
088
089        descriptor.initializeInContext(extension.getContext());
090
091        // register and invalidFate merged Categorizers
092        registeredCategorizers.add(descriptor);
093        mergedCategorizers = null;
094    }
095
096    protected synchronized void unregisterCategorizerDescriptor(CategorizerDescriptor descriptor, Extension extension) {
097
098        int index = registeredCategorizers.lastIndexOf(descriptor);
099        if (index != -1) {
100            registeredCategorizers.remove(index);
101            mergedCategorizers = null;
102        } else {
103            log.warn(String.format("no registered Categorizer under name '%s'", descriptor.getName()));
104        }
105    }
106
107    protected Map<String, CategorizerDescriptor> getMergedDescriptors() {
108        if (mergedCategorizers == null) {
109            synchronized (this) {
110                if (mergedCategorizers == null) {
111                    mergedCategorizers = new LinkedHashMap<String, CategorizerDescriptor>();
112                    for (CategorizerDescriptor descriptor : registeredCategorizers) {
113                        String name = descriptor.getName();
114                        if (descriptor.isEnabled()) {
115                            CategorizerDescriptor previousDescriptor = mergedCategorizers.get(name);
116                            CategorizerDescriptor mergedDescriptor = new CategorizerDescriptor();
117                            mergedDescriptor.merge(previousDescriptor);
118                            mergedDescriptor.merge(descriptor);
119                            mergedCategorizers.put(name, mergedDescriptor);
120                        } else {
121                            mergedCategorizers.remove(name);
122                        }
123                    }
124                }
125            }
126        }
127        return mergedCategorizers;
128    }
129
130    public List<DocumentModel> updateCategories(CoreSession session, List<DocumentRef> docRefs) {
131        DocumentModelList documents = session.getDocuments(docRefs.toArray(new DocumentRef[docRefs.size()]));
132        return updateCategories(documents);
133    }
134
135    public List<DocumentModel> updateCategories(List<DocumentModel> documents) {
136
137        Set<DocumentModel> impactedDocs = new LinkedHashSet<DocumentModel>();
138
139        for (DocumentModel doc : documents) {
140            List<CategorizerDescriptor> categorizersToApply = new LinkedList<CategorizerDescriptor>();
141            for (CategorizerDescriptor categorizer : getMergedDescriptors().values()) {
142                if (categorizer.shouldProcess(doc)) {
143                    categorizersToApply.add(categorizer);
144                }
145            }
146            if (!categorizersToApply.isEmpty()) {
147                // avoid extracting the fulltext content if no categorizer to
148                // apply
149                String textContent = extractTextContent(doc);
150                for (CategorizerDescriptor categorizer : categorizersToApply) {
151                    if (textContent.length() > categorizer.getMinTextLength()) {
152                        categorizer.processDocument(doc, textContent);
153                    }
154                }
155                impactedDocs.add(doc);
156            }
157        }
158        return new ArrayList<DocumentModel>(impactedDocs);
159    }
160
161    public String extractTextContent(DocumentModel doc) {
162        List<String> strings = new LinkedList<String>();
163
164        // text properties
165        strings.add(doc.getTitle());
166        String description = doc.getProperty("dc:description").getValue(String.class);
167        if (description != null) {
168            strings.add(description);
169        }
170        // TODO: extract / factorize / reuse the SQL storage full-text indexing
171        // text extraction code
172
173        List<Blob> blobs = extractor.getBlobs(doc);
174        try {
175            String noteContent = (String) doc.getPropertyValue("note:note");
176            Blob noteBlob = Blobs.createBlob(noteContent, "text/html");
177            blobs.add(noteBlob);
178        } catch (PropertyException pe) {
179            // not a note, ignore
180        }
181
182        // binary properties
183        ConversionService conversionService = getConversionService();
184        for (Blob blob : blobs) {
185            try {
186                SimpleBlobHolder bh = new SimpleBlobHolder(blob);
187                BlobHolder result = conversionService.convert(ANY2TEXT, bh, null);
188                if (result == null) {
189                    continue;
190                }
191                blob = result.getBlob();
192                if (blob == null) {
193                    continue;
194                }
195                String string = new String(blob.getByteArray(), "UTF-8");
196                // strip '\0 chars from text
197                if (string.indexOf('\0') >= 0) {
198                    string = string.replace("\0", " ");
199                }
200                strings.add(string);
201            } catch (ConversionException | IOException e) {
202                log.error(e.getMessage(), e);
203            }
204        }
205        return StringUtils.join(strings, "\n");
206    }
207
208    protected ConversionService getConversionService() {
209        if (conversionService == null) {
210            conversionService = Framework.getService(ConversionService.class);
211        }
212        return conversionService;
213    }
214}