001/*
002 * Copyright (c) 2006-2013 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the Eclipse Public License v1.0
006 * which accompanies this distribution, and is available at
007 * http://www.eclipse.org/legal/epl-v10.html
008 *
009 * Contributors:
010 *     Florent Guillaume
011 *     Stephane Lacoin
012 */
013package org.nuxeo.ecm.core.storage;
014
015import java.io.IOException;
016import java.util.LinkedList;
017import java.util.List;
018
019import org.apache.commons.lang.StringUtils;
020import org.apache.commons.logging.Log;
021import org.apache.commons.logging.LogFactory;
022import org.nuxeo.ecm.core.api.Blob;
023import org.nuxeo.ecm.core.api.DocumentModel;
024import org.nuxeo.ecm.core.api.IdRef;
025import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
026import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
027import org.nuxeo.ecm.core.convert.api.ConversionException;
028import org.nuxeo.ecm.core.convert.api.ConversionService;
029import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText;
030import org.nuxeo.ecm.core.utils.BlobsExtractor;
031import org.nuxeo.ecm.core.work.AbstractWork;
032import org.nuxeo.ecm.core.work.api.Work;
033import org.nuxeo.ecm.core.work.api.WorkManager;
034import org.nuxeo.runtime.api.Framework;
035
036/**
037 * Work task that does fulltext extraction from the blobs of the given document.
038 * <p>
039 * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}.
040 * <p>
041 * This base abstract class must be subclassed in order to implement the proper
042 * {@link #initFulltextConfigurationAndParser} depending on the storage.
043 *
044 * @since 5.7
045 */
046public abstract class FulltextExtractorWork extends AbstractWork {
047
048    private static final long serialVersionUID = 1L;
049
050    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);
051
052    protected static final String ANY2TEXT = "any2text";
053
054    protected static final String CATEGORY = "fulltextExtractor";
055
056    protected static final String TITLE = "fulltextExtractor";
057
058    protected final boolean excludeProxies;
059
060    protected transient FulltextConfiguration fulltextConfiguration;
061
062    protected transient FulltextParser fulltextParser;
063
064    public FulltextExtractorWork(String repositoryName, String docId, String id, boolean excludeProxies) {
065        super(id);
066        setDocument(repositoryName, docId);
067        this.excludeProxies = excludeProxies;
068    }
069
070    @Override
071    public String getCategory() {
072        return CATEGORY;
073    }
074
075    @Override
076    public String getTitle() {
077        return TITLE;
078    }
079
080    @Override
081    public int getRetryCount() {
082        // even read-only threads may encounter concurrent update exceptions
083        // when trying to read a previously deleted complex property
084        // due to read committed semantics, cf NXP-17384
085        return 1;
086    }
087
088    @Override
089    public void work() {
090        initSession();
091        // if the runtime has shutdown (normally because tests are finished)
092        // this can happen, see NXP-4009
093        if (session.getPrincipal() == null) {
094            return;
095        }
096
097        initFulltextConfigurationAndParser();
098
099        setStatus("Extracting");
100        setProgress(Progress.PROGRESS_0_PC);
101        extractBinaryText();
102        setProgress(Progress.PROGRESS_100_PC);
103        setStatus("Done");
104    }
105
106    /**
107     * Initializes the fulltext configuration and parser.
108     *
109     * @since 5.9.5
110     */
111    public abstract void initFulltextConfigurationAndParser();
112
113    protected void extractBinaryText() {
114        IdRef docRef = new IdRef(docId);
115        if (!session.exists(docRef)) {
116            // doc is gone
117            return;
118        }
119        DocumentModel doc = session.getDocument(docRef);
120        if (excludeProxies && doc.isProxy()) {
121            // VCS proxies don't have any fulltext attached, it's
122            // the target document that carries it
123            return;
124        }
125        if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) {
126            // excluded by config
127            return;
128        }
129
130        // Iterate on each index to set the binaryText column
131        BlobsExtractor extractor = new BlobsExtractor();
132        List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>();
133        for (String indexName : fulltextConfiguration.indexNames) {
134            if (!fulltextConfiguration.indexesAllBinary.contains(indexName)
135                    && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) {
136                // nothing to do: index not configured for blob
137                continue;
138            }
139            extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName),
140                    fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName),
141                    fulltextConfiguration.indexesAllBinary.contains(indexName));
142            List<Blob> blobs = extractor.getBlobs(doc);
143            String text = blobsToText(blobs, docId);
144            text = fulltextParser.parse(text, null);
145            indexesAndText.add(new IndexAndText(indexName, text));
146        }
147        if (!indexesAndText.isEmpty()) {
148            Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText);
149            WorkManager workManager = Framework.getLocalService(WorkManager.class);
150            workManager.schedule(work, true);
151        }
152    }
153
154    @Override
155    public void cleanUp(boolean ok, Exception e) {
156        super.cleanUp(ok, e);
157        fulltextConfiguration = null;
158        fulltextParser = null;
159    }
160
161    protected String blobsToText(List<Blob> blobs, String docId) {
162        List<String> strings = new LinkedList<String>();
163        for (Blob blob : blobs) {
164            try {
165                SimpleBlobHolder bh = new SimpleBlobHolder(blob);
166                BlobHolder result = convert(bh);
167                if (result == null) {
168                    continue;
169                }
170                blob = result.getBlob();
171                if (blob == null) {
172                    continue;
173                }
174                String string = new String(blob.getByteArray(), "UTF-8");
175                // strip '\0 chars from text
176                if (string.indexOf('\0') >= 0) {
177                    string = string.replace("\0", " ");
178                }
179                strings.add(string);
180            } catch (ConversionException | IOException e) {
181                String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId
182                        + ": " + e;
183                log.warn(msg);
184                log.debug(msg, e);
185                continue;
186            }
187        }
188        return StringUtils.join(strings, " ");
189    }
190
191    protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException {
192        ConversionService conversionService = Framework.getLocalService(ConversionService.class);
193        if (conversionService == null) {
194            log.debug("No ConversionService available");
195            return null;
196        }
197        return conversionService.convert(ANY2TEXT, blobHolder, null);
198    }
199
200}