001/*
002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 *     Stephane Lacoin
019 */
020package org.nuxeo.ecm.core.storage;
021
022import java.io.IOException;
023import java.util.LinkedList;
024import java.util.List;
025
026import org.apache.commons.lang.StringUtils;
027import org.apache.commons.logging.Log;
028import org.apache.commons.logging.LogFactory;
029import org.nuxeo.ecm.core.api.Blob;
030import org.nuxeo.ecm.core.api.DocumentLocation;
031import org.nuxeo.ecm.core.api.DocumentModel;
032import org.nuxeo.ecm.core.api.IdRef;
033import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
034import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
035import org.nuxeo.ecm.core.api.impl.DocumentLocationImpl;
036import org.nuxeo.ecm.core.api.impl.blob.StringBlob;
037import org.nuxeo.ecm.core.convert.api.ConversionException;
038import org.nuxeo.ecm.core.convert.api.ConversionService;
039import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText;
040import org.nuxeo.ecm.core.utils.BlobsExtractor;
041import org.nuxeo.ecm.core.work.AbstractWork;
042import org.nuxeo.ecm.core.work.api.Work;
043import org.nuxeo.ecm.core.work.api.WorkManager;
044import org.nuxeo.runtime.api.Framework;
045
046/**
047 * Work task that does fulltext extraction from the blobs of the given document.
048 * <p>
049 * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}.
050 * <p>
051 * This base abstract class must be subclassed in order to implement the proper
052 * {@link #initFulltextConfigurationAndParser} depending on the storage.
053 *
054 * @since 5.7
055 */
056public abstract class FulltextExtractorWork extends AbstractWork {
057
058    private static final long serialVersionUID = 1L;
059
060    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);
061
062    protected static final String ANY2TEXT = "any2text";
063
064    protected static final String CATEGORY = "fulltextExtractor";
065
066    protected static final String TITLE = "fulltextExtractor";
067
068    protected final boolean excludeProxies;
069
070    protected transient FulltextConfiguration fulltextConfiguration;
071
072    protected transient FulltextParser fulltextParser;
073
074    public FulltextExtractorWork(String repositoryName, String docId, String id, boolean excludeProxies) {
075        super(id);
076        setDocument(repositoryName, docId);
077        this.excludeProxies = excludeProxies;
078    }
079
080    @Override
081    public String getCategory() {
082        return CATEGORY;
083    }
084
085    @Override
086    public String getTitle() {
087        return TITLE;
088    }
089
090    @Override
091    public int getRetryCount() {
092        // even read-only threads may encounter concurrent update exceptions
093        // when trying to read a previously deleted complex property
094        // due to read committed semantics, cf NXP-17384
095        return 1;
096    }
097
098    @Override
099    public void work() {
100        openSystemSession();
101        // if the runtime has shutdown (normally because tests are finished)
102        // this can happen, see NXP-4009
103        if (session.getPrincipal() == null) {
104            return;
105        }
106
107        initFulltextConfigurationAndParser();
108
109        setStatus("Extracting");
110        setProgress(Progress.PROGRESS_0_PC);
111        extractBinaryText();
112        setProgress(Progress.PROGRESS_100_PC);
113        setStatus("Done");
114    }
115
116    /**
117     * Initializes the fulltext configuration and parser.
118     *
119     * @since 5.9.5
120     */
121    public abstract void initFulltextConfigurationAndParser();
122
123    protected void extractBinaryText() {
124        IdRef docRef = new IdRef(docId);
125        if (!session.exists(docRef)) {
126            // doc is gone
127            return;
128        }
129        DocumentModel doc = session.getDocument(docRef);
130        if (excludeProxies && doc.isProxy()) {
131            // VCS proxies don't have any fulltext attached, it's
132            // the target document that carries it
133            return;
134        }
135        if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) {
136            // excluded by config
137            return;
138        }
139
140        // Iterate on each index to set the binaryText column
141        BlobsExtractor extractor = new BlobsExtractor();
142        DocumentLocation docLocation = new DocumentLocationImpl(doc);
143        List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>();
144        for (String indexName : fulltextConfiguration.indexNames) {
145            if (!fulltextConfiguration.indexesAllBinary.contains(indexName)
146                    && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) {
147                // nothing to do: index not configured for blob
148                continue;
149            }
150            extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName),
151                    fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName),
152                    fulltextConfiguration.indexesAllBinary.contains(indexName));
153            List<Blob> blobs = extractor.getBlobs(doc);
154            StringBlob stringBlob = blobsToStringBlob(blobs, docId);
155            String text = fulltextParser.parse(stringBlob.getString(), null, stringBlob.getMimeType(), docLocation);
156            int fullTextFieldSizeLimit = fulltextConfiguration.fulltextFieldSizeLimit;
157            if (fullTextFieldSizeLimit != 0 && text.length() > fullTextFieldSizeLimit) {
158                if (log.isDebugEnabled()) {
159                    log.debug(String.format(
160                            "Fulltext extract of length: %s for indexName: %s of document: %s truncated to length: %s",
161                            text.length(), indexName, docId, fullTextFieldSizeLimit));
162                }
163                text = text.substring(0, fullTextFieldSizeLimit);
164            }
165            indexesAndText.add(new IndexAndText(indexName, text));
166        }
167        if (!indexesAndText.isEmpty()) {
168            Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText);
169            if (!fulltextConfiguration.fulltextSearchDisabled) {
170                WorkManager workManager = Framework.getLocalService(WorkManager.class);
171                workManager.schedule(work, true);
172            } else {
173                ((FulltextUpdaterWork)work).updateWithSession(session);
174            }
175        }
176
177    }
178
179    @Override
180    public void cleanUp(boolean ok, Exception e) {
181        super.cleanUp(ok, e);
182        fulltextConfiguration = null;
183        fulltextParser = null;
184    }
185
186    protected StringBlob blobsToStringBlob(List<Blob> blobs, String docId) {
187        String mimeType = null;
188        List<String> strings = new LinkedList<String>();
189        for (Blob blob : blobs) {
190            try {
191                SimpleBlobHolder bh = new SimpleBlobHolder(blob);
192                BlobHolder result = convert(bh);
193                if (result == null) {
194                    continue;
195                }
196                blob = result.getBlob();
197                if (blob == null) {
198                    continue;
199                }
200                if (StringUtils.isEmpty(mimeType) && StringUtils.isNotEmpty(blob.getMimeType())) {
201                    mimeType = blob.getMimeType();
202                }
203                String string = new String(blob.getByteArray(), "UTF-8");
204                // strip '\0 chars from text
205                if (string.indexOf('\0') >= 0) {
206                    string = string.replace("\0", " ");
207                }
208                strings.add(string);
209            } catch (ConversionException | IOException e) {
210                String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId
211                        + ": " + e;
212                log.warn(msg);
213                log.debug(msg, e);
214                continue;
215            }
216        }
217        return new StringBlob(StringUtils.join(strings, " "), mimeType);
218    }
219
220    protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException {
221        ConversionService conversionService = Framework.getLocalService(ConversionService.class);
222        if (conversionService == null) {
223            log.debug("No ConversionService available");
224            return null;
225        }
226        return conversionService.convert(ANY2TEXT, blobHolder, null);
227    }
228
229}