001/*
002 * (C) Copyright 2006-2018 Nuxeo (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 *     Stephane Lacoin
019 */
020package org.nuxeo.ecm.core.storage;
021
022import java.io.IOException;
023import java.io.Serializable;
024import java.util.ArrayList;
025import java.util.Collections;
026import java.util.IdentityHashMap;
027import java.util.List;
028import java.util.Map;
029import java.util.Set;
030import java.util.stream.Collectors;
031
032import org.apache.commons.logging.Log;
033import org.apache.commons.logging.LogFactory;
034import org.apache.commons.text.StringEscapeUtils;
035import org.nuxeo.ecm.core.api.Blob;
036import org.nuxeo.ecm.core.api.DocumentModel;
037import org.nuxeo.ecm.core.api.DocumentRef;
038import org.nuxeo.ecm.core.api.IdRef;
039import org.nuxeo.ecm.core.api.IterableQueryResult;
040import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
041import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
042import org.nuxeo.ecm.core.api.repository.FulltextConfiguration;
043import org.nuxeo.ecm.core.convert.api.ConversionException;
044import org.nuxeo.ecm.core.convert.api.ConversionService;
045import org.nuxeo.ecm.core.model.Repository;
046import org.nuxeo.ecm.core.query.sql.NXQL;
047import org.nuxeo.ecm.core.repository.RepositoryService;
048import org.nuxeo.ecm.core.utils.BlobsExtractor;
049import org.nuxeo.ecm.core.utils.StringsExtractor;
050import org.nuxeo.ecm.core.work.AbstractWork;
051import org.nuxeo.runtime.api.Framework;
052
053import net.htmlparser.jericho.Source;
054
055/**
056 * Work task that does fulltext extraction from the string properties and the blobs of the given document, saving them
057 * into the fulltext table.
058 *
059 * @since 5.7 for the original implementation
060 * @since 10.3 the extraction and update are done in the same Work
061 */
062public class FulltextExtractorWork extends AbstractWork {
063
064    private static final long serialVersionUID = 1L;
065
066    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);
067
068    public static final String SYSPROP_FULLTEXT_SIMPLE = "fulltextSimple";
069
070    public static final String SYSPROP_FULLTEXT_BINARY = "fulltextBinary";
071
072    public static final String SYSPROP_FULLTEXT_JOBID = "fulltextJobId";
073
074    public static final String FULLTEXT_DEFAULT_INDEX = "default";
075
076    protected static final String CATEGORY = "fulltextExtractor";
077
078    protected static final String TITLE = "Fulltext Extractor";
079
080    protected static final String ANY2TEXT_CONVERTER = "any2text";
081
082    protected static final int HTML_MAGIC_OFFSET = 8192;
083
084    protected static final String TEXT_HTML = "text/html";
085
086    protected transient FulltextConfiguration fulltextConfiguration;
087
088    protected transient DocumentModel document;
089
090    protected transient List<DocumentRef> docsToUpdate;
091
092    /** If true, update the simple text from the document. */
093    protected final boolean updateSimpleText;
094
095    /** If true, update the binary text from the document. */
096    protected final boolean updateBinaryText;
097
098    protected final boolean useJobId;
099
100    public FulltextExtractorWork(String repositoryName, String docId, boolean updateSimpleText,
101            boolean updateBinaryText, boolean useJobId) {
102        super(); // random id, for unique job
103        setDocument(repositoryName, docId);
104        this.updateSimpleText = updateSimpleText;
105        this.updateBinaryText = updateBinaryText;
106        this.useJobId = useJobId;
107    }
108
109    @Override
110    public String getCategory() {
111        return CATEGORY;
112    }
113
114    @Override
115    public String getTitle() {
116        return TITLE;
117    }
118
119    @Override
120    public int getRetryCount() {
121        return 1;
122    }
123
124    @Override
125    public void work() {
126        openSystemSession();
127        // if the runtime has shut down (normally because tests are finished)
128        // this can happen, see NXP-4009
129        if (session.getPrincipal() == null) {
130            return;
131        }
132        DocumentRef docRef = new IdRef(docId);
133        if (!session.exists(docRef)) {
134            return;
135        }
136        document = session.getDocument(docRef);
137        findDocsToUpdate();
138        if (docsToUpdate.isEmpty()) {
139            return;
140        }
141        initFulltextConfiguration();
142
143        setStatus("Extracting");
144        setProgress(Progress.PROGRESS_0_PC);
145        extractAndUpdate();
146        setStatus("Saving");
147        session.save();
148        setProgress(Progress.PROGRESS_100_PC);
149        setStatus("Done");
150    }
151
152    protected void initFulltextConfiguration() {
153        RepositoryService repositoryService = Framework.getService(RepositoryService.class);
154        Repository repository = repositoryService.getRepository(repositoryName);
155        fulltextConfiguration = repository.getFulltextConfiguration();
156    }
157
158    protected void findDocsToUpdate() {
159        if (useJobId) {
160            // find which docs will receive the extracted text (there may be more than one if the original
161            // doc was copied between the time it was saved and this listener being asynchronously executed)
162            String query = String.format(
163                    "SELECT ecm:uuid FROM Document WHERE ecm:fulltextJobId = '%s' AND ecm:isProxy = 0", docId);
164            docsToUpdate = new ArrayList<>();
165            try (IterableQueryResult it = session.queryAndFetch(query, NXQL.NXQL)) {
166                for (Map<String, Serializable> map : it) {
167                    docsToUpdate.add(new IdRef((String) map.get(NXQL.ECM_UUID)));
168                }
169            }
170        } else {
171            docsToUpdate = Collections.singletonList(document.getRef());
172        }
173    }
174
175    protected void extractAndUpdate() {
176        // update all docs
177        if (updateSimpleText) {
178            extractAndUpdateSimpleText();
179        }
180        if (updateBinaryText) {
181            extractAndUpdateBinaryText();
182        }
183        // reset job id
184        for (DocumentRef docRef : docsToUpdate) {
185            session.setDocumentSystemProp(docRef, SYSPROP_FULLTEXT_JOBID, null);
186        }
187    }
188
189    protected void extractAndUpdateSimpleText() {
190        if (fulltextConfiguration == null || fulltextConfiguration.fulltextSearchDisabled) {
191            // if fulltext search is disabled, we don't extract simple text at all
192            return;
193        }
194        for (String indexName : fulltextConfiguration.indexNames) {
195            if (!fulltextConfiguration.indexesAllSimple.contains(indexName)
196                    && fulltextConfiguration.propPathsByIndexSimple.get(indexName) == null) {
197                // nothing to do: index not configured for simple text
198                continue;
199            }
200            Set<String> includedPaths = fulltextConfiguration.indexesAllSimple.contains(indexName) ? null
201                    : fulltextConfiguration.propPathsByIndexSimple.get(indexName);
202            Set<String> excludedPaths = fulltextConfiguration.propPathsExcludedByIndexSimple.get(indexName);
203            // get string properties
204            List<String> strings = new StringsExtractor().findStrings(document, includedPaths, excludedPaths);
205            // transform to text (remove HTML and entities)
206            // we do this here rather than in the indexing backend (Elasticsearch) because it's more efficient here
207            // add space at beginning and end for simulated phrase search using LIKE "% foo bar %"
208            String text = strings.stream().map(this::stringToText).collect(Collectors.joining(" ", " ", " "));
209            // limit size
210            text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit);
211            String property = getFulltextPropertyName(SYSPROP_FULLTEXT_SIMPLE, indexName);
212            for (DocumentRef docRef : docsToUpdate) {
213                session.setDocumentSystemProp(docRef, property, text);
214            }
215        }
216    }
217
218    protected void extractAndUpdateBinaryText() {
219        // we extract binary text even if fulltext search is disabled,
220        // because it is still used to inject into external indexers like Elasticsearch
221        BlobsExtractor blobsExtractor = new BlobsExtractor();
222        Map<Blob, String> blobsText = new IdentityHashMap<>();
223        for (String indexName : fulltextConfiguration.indexNames) {
224            if (!fulltextConfiguration.indexesAllBinary.contains(indexName)
225                    && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) {
226                // nothing to do: index not configured for blob
227                continue;
228            }
229            // get original text from all blobs
230            blobsExtractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName),
231                    fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName),
232                    fulltextConfiguration.indexesAllBinary.contains(indexName));
233            List<String> strings = new ArrayList<>();
234            for (Blob blob : blobsExtractor.getBlobs(document)) {
235                String string = blobsText.computeIfAbsent(blob, this::blobToText);
236                strings.add(string);
237            }
238            // add space at beginning and end for simulated phrase search using LIKE "% foo bar %"
239            String text = " " + String.join(" ", strings) + " ";
240            text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit);
241            String property = getFulltextPropertyName(SYSPROP_FULLTEXT_BINARY, indexName);
242            for (DocumentRef docRef : docsToUpdate) {
243                session.setDocumentSystemProp(docRef, property, text);
244            }
245        }
246    }
247
248    protected String stringToText(String string) {
249        string = removeHtml(string);
250        string = removeEntities(string);
251        return string;
252    }
253
254    protected String removeHtml(String string) {
255        // quick HTML detection on the initial part of the string
256        String initial = string.substring(0, Math.min(string.length(), HTML_MAGIC_OFFSET)).toLowerCase();
257        if (initial.startsWith("<!doctype html") || initial.contains("<html")) {
258            // convert using Jericho HTML Parser
259            string = new Source(string).getRenderer()
260                                       .setIncludeHyperlinkURLs(false)
261                                       .setDecorateFontStyles(false)
262                                       .toString();
263        }
264        return string;
265    }
266
267    protected String removeEntities(String string) {
268        if (string.indexOf('&') >= 0) {
269            string = StringEscapeUtils.unescapeHtml4(string);
270        }
271        return string;
272    }
273
274    /**
275     * Converts the blob to text by calling a converter.
276     */
277    protected String blobToText(Blob blob) {
278        try {
279            ConversionService conversionService = Framework.getService(ConversionService.class);
280            if (conversionService == null) {
281                log.debug("No ConversionService available");
282                return "";
283            }
284            BlobHolder blobHolder = conversionService.convert(ANY2TEXT_CONVERTER, new SimpleBlobHolder(blob), null);
285            if (blobHolder == null) {
286                return "";
287            }
288            Blob resultBlob = blobHolder.getBlob();
289            if (resultBlob == null) {
290                return "";
291            }
292            String string = resultBlob.getString();
293            // strip '\0 chars from text
294            if (string.indexOf('\0') >= 0) {
295                string = string.replace("\0", " ");
296            }
297            return string;
298        } catch (ConversionException | IOException e) {
299            String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId + ": "
300                    + e;
301            log.warn(msg);
302            log.debug(msg, e);
303            return "";
304        }
305    }
306
307    @SuppressWarnings("boxing")
308    protected String limitStringSize(String string, int maxSize) {
309        if (maxSize != 0 && string.length() > maxSize) {
310            if (log.isDebugEnabled()) {
311                log.debug(String.format("Fulltext extract of length: %s for document: %s truncated to length: %s",
312                        string.length(), docId, maxSize));
313            }
314            string = string.substring(0, maxSize);
315        }
316        return string;
317    }
318
319    protected String getFulltextPropertyName(String name, String indexName) {
320        if (!FULLTEXT_DEFAULT_INDEX.equals(indexName)) {
321            name += '_' + indexName;
322        }
323        return name;
324    }
325
326}