001/*
002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 *     Stephane Lacoin
019 */
020package org.nuxeo.ecm.core.storage;
021
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.DocumentLocation;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.IdRef;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.api.impl.DocumentLocationImpl;
import org.nuxeo.ecm.core.api.impl.blob.StringBlob;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.ecm.core.work.AbstractWork;
import org.nuxeo.ecm.core.work.api.Work;
import org.nuxeo.ecm.core.work.api.WorkManager;
import org.nuxeo.runtime.api.Framework;
045
046/**
047 * Work task that does fulltext extraction from the blobs of the given document.
048 * <p>
049 * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}.
050 * <p>
051 * This base abstract class must be subclassed in order to implement the proper
052 * {@link #initFulltextConfigurationAndParser} depending on the storage.
053 *
054 * @since 5.7
055 */
056public abstract class FulltextExtractorWork extends AbstractWork {
057
058    private static final long serialVersionUID = 1L;
059
060    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);
061
062    protected static final String ANY2TEXT = "any2text";
063
064    protected static final String CATEGORY = "fulltextExtractor";
065
066    protected static final String TITLE = "fulltextExtractor";
067
068    protected final boolean excludeProxies;
069
070    protected transient FulltextConfiguration fulltextConfiguration;
071
072    protected transient FulltextParser fulltextParser;
073
074    public FulltextExtractorWork(String repositoryName, String docId, boolean excludeProxies) {
075        setDocument(repositoryName, docId);
076        this.excludeProxies = excludeProxies;
077    }
078
079    @Override
080    public String getCategory() {
081        return CATEGORY;
082    }
083
084    @Override
085    public String getTitle() {
086        return TITLE;
087    }
088
089    @Override
090    public int getRetryCount() {
091        // even read-only threads may encounter concurrent update exceptions
092        // when trying to read a previously deleted complex property
093        // due to read committed semantics, cf NXP-17384
094        return 1;
095    }
096
097    @Override
098    public void work() {
099        openSystemSession();
100        // if the runtime has shutdown (normally because tests are finished)
101        // this can happen, see NXP-4009
102        if (session.getPrincipal() == null) {
103            return;
104        }
105
106        initFulltextConfigurationAndParser();
107
108        setStatus("Extracting");
109        setProgress(Progress.PROGRESS_0_PC);
110        extractBinaryText();
111        setProgress(Progress.PROGRESS_100_PC);
112        setStatus("Done");
113    }
114
115    /**
116     * Initializes the fulltext configuration and parser.
117     *
118     * @since 5.9.5
119     */
120    public abstract void initFulltextConfigurationAndParser();
121
122    protected void extractBinaryText() {
123        IdRef docRef = new IdRef(docId);
124        if (!session.exists(docRef)) {
125            // doc is gone
126            return;
127        }
128        DocumentModel doc = session.getDocument(docRef);
129        if (excludeProxies && doc.isProxy()) {
130            // VCS proxies don't have any fulltext attached, it's
131            // the target document that carries it
132            return;
133        }
134        if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) {
135            // excluded by config
136            return;
137        }
138
139        // Iterate on each index to set the binaryText column
140        BlobsExtractor extractor = new BlobsExtractor();
141        DocumentLocation docLocation = new DocumentLocationImpl(doc);
142        List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>();
143        for (String indexName : fulltextConfiguration.indexNames) {
144            if (!fulltextConfiguration.indexesAllBinary.contains(indexName)
145                    && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) {
146                // nothing to do: index not configured for blob
147                continue;
148            }
149            extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName),
150                    fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName),
151                    fulltextConfiguration.indexesAllBinary.contains(indexName));
152            List<Blob> blobs = extractor.getBlobs(doc);
153            StringBlob stringBlob = blobsToStringBlob(blobs, docId);
154            String text = fulltextParser.parse(stringBlob.getString(), null, stringBlob.getMimeType(), docLocation);
155            int fullTextFieldSizeLimit = fulltextConfiguration.fulltextFieldSizeLimit;
156            if (fullTextFieldSizeLimit != 0 && text.length() > fullTextFieldSizeLimit) {
157                if (log.isDebugEnabled()) {
158                    log.debug(String.format(
159                            "Fulltext extract of length: %s for indexName: %s of document: %s truncated to length: %s",
160                            text.length(), indexName, docId, fullTextFieldSizeLimit));
161                }
162                text = text.substring(0, fullTextFieldSizeLimit);
163            }
164            indexesAndText.add(new IndexAndText(indexName, text));
165        }
166        if (!indexesAndText.isEmpty()) {
167            Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText);
168            if (!fulltextConfiguration.fulltextSearchDisabled) {
169                WorkManager workManager = Framework.getService(WorkManager.class);
170                workManager.schedule(work, true);
171            } else {
172                ((FulltextUpdaterWork)work).updateWithSession(session);
173            }
174        }
175
176    }
177
178    @Override
179    public void cleanUp(boolean ok, Exception e) {
180        super.cleanUp(ok, e);
181        fulltextConfiguration = null;
182        fulltextParser = null;
183    }
184
185    protected StringBlob blobsToStringBlob(List<Blob> blobs, String docId) {
186        String mimeType = null;
187        List<String> strings = new LinkedList<String>();
188        for (Blob blob : blobs) {
189            try {
190                SimpleBlobHolder bh = new SimpleBlobHolder(blob);
191                BlobHolder result = convert(bh);
192                if (result == null) {
193                    continue;
194                }
195                blob = result.getBlob();
196                if (blob == null) {
197                    continue;
198                }
199                if (StringUtils.isEmpty(mimeType) && StringUtils.isNotEmpty(blob.getMimeType())) {
200                    mimeType = blob.getMimeType();
201                }
202                String string = new String(blob.getByteArray(), "UTF-8");
203                // strip '\0 chars from text
204                if (string.indexOf('\0') >= 0) {
205                    string = string.replace("\0", " ");
206                }
207                strings.add(string);
208            } catch (ConversionException | IOException e) {
209                String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId
210                        + ": " + e;
211                log.warn(msg);
212                log.debug(msg, e);
213                continue;
214            }
215        }
216        return new StringBlob(StringUtils.join(strings, " "), mimeType);
217    }
218
219    protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException {
220        ConversionService conversionService = Framework.getService(ConversionService.class);
221        if (conversionService == null) {
222            log.debug("No ConversionService available");
223            return null;
224        }
225        return conversionService.convert(ANY2TEXT, blobHolder, null);
226    }
227
228}