001/*
002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 *     Stephane Lacoin
019 */
020package org.nuxeo.ecm.core.storage;
021
022import java.io.IOException;
023import java.util.LinkedList;
024import java.util.List;
025
026import org.apache.commons.lang.StringUtils;
027import org.apache.commons.logging.Log;
028import org.apache.commons.logging.LogFactory;
029import org.nuxeo.ecm.core.api.Blob;
030import org.nuxeo.ecm.core.api.DocumentModel;
031import org.nuxeo.ecm.core.api.IdRef;
032import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
033import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
034import org.nuxeo.ecm.core.convert.api.ConversionException;
035import org.nuxeo.ecm.core.convert.api.ConversionService;
036import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText;
037import org.nuxeo.ecm.core.utils.BlobsExtractor;
038import org.nuxeo.ecm.core.work.AbstractWork;
039import org.nuxeo.ecm.core.work.api.Work;
040import org.nuxeo.ecm.core.work.api.WorkManager;
041import org.nuxeo.runtime.api.Framework;
042
043/**
044 * Work task that does fulltext extraction from the blobs of the given document.
045 * <p>
046 * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}.
047 * <p>
048 * This base abstract class must be subclassed in order to implement the proper
049 * {@link #initFulltextConfigurationAndParser} depending on the storage.
050 *
051 * @since 5.7
052 */
053public abstract class FulltextExtractorWork extends AbstractWork {
054
055    private static final long serialVersionUID = 1L;
056
057    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);
058
059    protected static final String ANY2TEXT = "any2text";
060
061    protected static final String CATEGORY = "fulltextExtractor";
062
063    protected static final String TITLE = "fulltextExtractor";
064
065    protected final boolean excludeProxies;
066
067    protected transient FulltextConfiguration fulltextConfiguration;
068
069    protected transient FulltextParser fulltextParser;
070
071    public FulltextExtractorWork(String repositoryName, String docId, String id, boolean excludeProxies) {
072        super(id);
073        setDocument(repositoryName, docId);
074        this.excludeProxies = excludeProxies;
075    }
076
077    @Override
078    public String getCategory() {
079        return CATEGORY;
080    }
081
082    @Override
083    public String getTitle() {
084        return TITLE;
085    }
086
087    @Override
088    public int getRetryCount() {
089        // even read-only threads may encounter concurrent update exceptions
090        // when trying to read a previously deleted complex property
091        // due to read committed semantics, cf NXP-17384
092        return 1;
093    }
094
095    @Override
096    public void work() {
097        openSystemSession();
098        // if the runtime has shutdown (normally because tests are finished)
099        // this can happen, see NXP-4009
100        if (session.getPrincipal() == null) {
101            return;
102        }
103
104        initFulltextConfigurationAndParser();
105
106        setStatus("Extracting");
107        setProgress(Progress.PROGRESS_0_PC);
108        extractBinaryText();
109        setProgress(Progress.PROGRESS_100_PC);
110        setStatus("Done");
111    }
112
113    /**
114     * Initializes the fulltext configuration and parser.
115     *
116     * @since 5.9.5
117     */
118    public abstract void initFulltextConfigurationAndParser();
119
120    protected void extractBinaryText() {
121        IdRef docRef = new IdRef(docId);
122        if (!session.exists(docRef)) {
123            // doc is gone
124            return;
125        }
126        DocumentModel doc = session.getDocument(docRef);
127        if (excludeProxies && doc.isProxy()) {
128            // VCS proxies don't have any fulltext attached, it's
129            // the target document that carries it
130            return;
131        }
132        if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) {
133            // excluded by config
134            return;
135        }
136
137        // Iterate on each index to set the binaryText column
138        BlobsExtractor extractor = new BlobsExtractor();
139        List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>();
140        for (String indexName : fulltextConfiguration.indexNames) {
141            if (!fulltextConfiguration.indexesAllBinary.contains(indexName)
142                    && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) {
143                // nothing to do: index not configured for blob
144                continue;
145            }
146            extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName),
147                    fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName),
148                    fulltextConfiguration.indexesAllBinary.contains(indexName));
149            List<Blob> blobs = extractor.getBlobs(doc);
150            String text = blobsToText(blobs, docId);
151            text = fulltextParser.parse(text, null);
152            indexesAndText.add(new IndexAndText(indexName, text));
153        }
154        if (!indexesAndText.isEmpty()) {
155            Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText);
156            WorkManager workManager = Framework.getLocalService(WorkManager.class);
157            workManager.schedule(work, true);
158        }
159    }
160
161    @Override
162    public void cleanUp(boolean ok, Exception e) {
163        super.cleanUp(ok, e);
164        fulltextConfiguration = null;
165        fulltextParser = null;
166    }
167
168    protected String blobsToText(List<Blob> blobs, String docId) {
169        List<String> strings = new LinkedList<String>();
170        for (Blob blob : blobs) {
171            try {
172                SimpleBlobHolder bh = new SimpleBlobHolder(blob);
173                BlobHolder result = convert(bh);
174                if (result == null) {
175                    continue;
176                }
177                blob = result.getBlob();
178                if (blob == null) {
179                    continue;
180                }
181                String string = new String(blob.getByteArray(), "UTF-8");
182                // strip '\0 chars from text
183                if (string.indexOf('\0') >= 0) {
184                    string = string.replace("\0", " ");
185                }
186                strings.add(string);
187            } catch (ConversionException | IOException e) {
188                String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId
189                        + ": " + e;
190                log.warn(msg);
191                log.debug(msg, e);
192                continue;
193            }
194        }
195        return StringUtils.join(strings, " ");
196    }
197
198    protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException {
199        ConversionService conversionService = Framework.getLocalService(ConversionService.class);
200        if (conversionService == null) {
201            log.debug("No ConversionService available");
202            return null;
203        }
204        return conversionService.convert(ANY2TEXT, blobHolder, null);
205    }
206
207}