001/* 002 * (C) Copyright 2006-2018 Nuxeo (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Florent Guillaume 018 * Stephane Lacoin 019 */ 020package org.nuxeo.ecm.core.storage; 021 022import java.io.IOException; 023import java.io.Serializable; 024import java.util.ArrayList; 025import java.util.Collections; 026import java.util.IdentityHashMap; 027import java.util.List; 028import java.util.Map; 029import java.util.Set; 030import java.util.stream.Collectors; 031 032import org.apache.commons.logging.Log; 033import org.apache.commons.logging.LogFactory; 034import org.apache.commons.text.StringEscapeUtils; 035import org.nuxeo.ecm.core.api.Blob; 036import org.nuxeo.ecm.core.api.DocumentModel; 037import org.nuxeo.ecm.core.api.DocumentRef; 038import org.nuxeo.ecm.core.api.IdRef; 039import org.nuxeo.ecm.core.api.IterableQueryResult; 040import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 041import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder; 042import org.nuxeo.ecm.core.api.repository.FulltextConfiguration; 043import org.nuxeo.ecm.core.convert.api.ConversionException; 044import org.nuxeo.ecm.core.convert.api.ConversionService; 045import org.nuxeo.ecm.core.model.Repository; 046import org.nuxeo.ecm.core.query.sql.NXQL; 047import org.nuxeo.ecm.core.repository.RepositoryService; 048import org.nuxeo.ecm.core.utils.BlobsExtractor; 049import org.nuxeo.ecm.core.utils.StringsExtractor; 050import org.nuxeo.ecm.core.work.AbstractWork; 051import org.nuxeo.runtime.api.Framework; 052 053import net.htmlparser.jericho.Source; 054 055/** 056 * Work task that does fulltext extraction from the string properties and the blobs of the given document, saving them 057 * into the fulltext table. 058 * 059 * @since 5.7 for the original implementation 060 * @since 10.3 the extraction and update are done in the same Work 061 */ 062public class FulltextExtractorWork extends AbstractWork { 063 064 private static final long serialVersionUID = 1L; 065 066 private static final Log log = LogFactory.getLog(FulltextExtractorWork.class); 067 068 public static final String SYSPROP_FULLTEXT_SIMPLE = "fulltextSimple"; 069 070 public static final String SYSPROP_FULLTEXT_BINARY = "fulltextBinary"; 071 072 public static final String SYSPROP_FULLTEXT_JOBID = "fulltextJobId"; 073 074 public static final String FULLTEXT_DEFAULT_INDEX = "default"; 075 076 protected static final String CATEGORY = "fulltextExtractor"; 077 078 protected static final String TITLE = "Fulltext Extractor"; 079 080 protected static final String ANY2TEXT_CONVERTER = "any2text"; 081 082 protected static final int HTML_MAGIC_OFFSET = 8192; 083 084 protected static final String TEXT_HTML = "text/html"; 085 086 protected transient FulltextConfiguration fulltextConfiguration; 087 088 protected transient DocumentModel document; 089 090 protected transient List<DocumentRef> docsToUpdate; 091 092 /** If true, update the simple text from the document. */ 093 protected final boolean updateSimpleText; 094 095 /** If true, update the binary text from the document. */ 096 protected final boolean updateBinaryText; 097 098 protected final boolean useJobId; 099 100 public FulltextExtractorWork(String repositoryName, String docId, boolean updateSimpleText, 101 boolean updateBinaryText, boolean useJobId) { 102 super(); // random id, for unique job 103 setDocument(repositoryName, docId); 104 this.updateSimpleText = updateSimpleText; 105 this.updateBinaryText = updateBinaryText; 106 this.useJobId = useJobId; 107 } 108 109 @Override 110 public String getCategory() { 111 return CATEGORY; 112 } 113 114 @Override 115 public String getTitle() { 116 return TITLE; 117 } 118 119 @Override 120 public int getRetryCount() { 121 return 1; 122 } 123 124 @Override 125 public void work() { 126 openSystemSession(); 127 // if the runtime has shut down (normally because tests are finished) 128 // this can happen, see NXP-4009 129 if (session.getPrincipal() == null) { 130 return; 131 } 132 DocumentRef docRef = new IdRef(docId); 133 if (!session.exists(docRef)) { 134 return; 135 } 136 document = session.getDocument(docRef); 137 findDocsToUpdate(); 138 if (docsToUpdate.isEmpty()) { 139 return; 140 } 141 initFulltextConfiguration(); 142 143 setStatus("Extracting"); 144 setProgress(Progress.PROGRESS_0_PC); 145 extractAndUpdate(); 146 setStatus("Saving"); 147 session.save(); 148 setProgress(Progress.PROGRESS_100_PC); 149 setStatus("Done"); 150 } 151 152 protected void initFulltextConfiguration() { 153 RepositoryService repositoryService = Framework.getService(RepositoryService.class); 154 Repository repository = repositoryService.getRepository(repositoryName); 155 fulltextConfiguration = repository.getFulltextConfiguration(); 156 } 157 158 protected void findDocsToUpdate() { 159 if (useJobId) { 160 // find which docs will receive the extracted text (there may be more than one if the original 161 // doc was copied between the time it was saved and this listener being asynchronously executed) 162 String query = String.format( 163 "SELECT ecm:uuid FROM Document WHERE ecm:fulltextJobId = '%s' AND ecm:isProxy = 0", docId); 164 docsToUpdate = new ArrayList<>(); 165 try (IterableQueryResult it = session.queryAndFetch(query, NXQL.NXQL)) { 166 for (Map<String, Serializable> map : it) { 167 docsToUpdate.add(new IdRef((String) map.get(NXQL.ECM_UUID))); 168 } 169 } 170 } else { 171 docsToUpdate = Collections.singletonList(document.getRef()); 172 } 173 } 174 175 protected void extractAndUpdate() { 176 // update all docs 177 if (updateSimpleText) { 178 extractAndUpdateSimpleText(); 179 } 180 if (updateBinaryText) { 181 extractAndUpdateBinaryText(); 182 } 183 // reset job id 184 for (DocumentRef docRef : docsToUpdate) { 185 session.setDocumentSystemProp(docRef, SYSPROP_FULLTEXT_JOBID, null); 186 } 187 } 188 189 protected void extractAndUpdateSimpleText() { 190 if (fulltextConfiguration == null || fulltextConfiguration.fulltextSearchDisabled) { 191 // if fulltext search is disabled, we don't extract simple text at all 192 return; 193 } 194 for (String indexName : fulltextConfiguration.indexNames) { 195 if (!fulltextConfiguration.indexesAllSimple.contains(indexName) 196 && fulltextConfiguration.propPathsByIndexSimple.get(indexName) == null) { 197 // nothing to do: index not configured for simple text 198 continue; 199 } 200 Set<String> includedPaths = fulltextConfiguration.indexesAllSimple.contains(indexName) ? null 201 : fulltextConfiguration.propPathsByIndexSimple.get(indexName); 202 Set<String> excludedPaths = fulltextConfiguration.propPathsExcludedByIndexSimple.get(indexName); 203 // get string properties 204 List<String> strings = new StringsExtractor().findStrings(document, includedPaths, excludedPaths); 205 // transform to text (remove HTML and entities) 206 // we do this here rather than in the indexing backend (Elasticsearch) because it's more efficient here 207 // add space at beginning and end for simulated phrase search using LIKE "% foo bar %" 208 String text = strings.stream().map(this::stringToText).collect(Collectors.joining(" ", " ", " ")); 209 // limit size 210 text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit); 211 String property = getFulltextPropertyName(SYSPROP_FULLTEXT_SIMPLE, indexName); 212 for (DocumentRef docRef : docsToUpdate) { 213 session.setDocumentSystemProp(docRef, property, text); 214 } 215 } 216 } 217 218 protected void extractAndUpdateBinaryText() { 219 // we extract binary text even if fulltext search is disabled, 220 // because it is still used to inject into external indexers like Elasticsearch 221 BlobsExtractor blobsExtractor = new BlobsExtractor(); 222 Map<Blob, String> blobsText = new IdentityHashMap<>(); 223 for (String indexName : fulltextConfiguration.indexNames) { 224 if (!fulltextConfiguration.indexesAllBinary.contains(indexName) 225 && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) { 226 // nothing to do: index not configured for blob 227 continue; 228 } 229 // get original text from all blobs 230 blobsExtractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName), 231 fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), 232 fulltextConfiguration.indexesAllBinary.contains(indexName)); 233 List<String> strings = new ArrayList<>(); 234 for (Blob blob : blobsExtractor.getBlobs(document)) { 235 String string = blobsText.computeIfAbsent(blob, this::blobToText); 236 strings.add(string); 237 } 238 // add space at beginning and end for simulated phrase search using LIKE "% foo bar %" 239 String text = " " + String.join(" ", strings) + " "; 240 text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit); 241 String property = getFulltextPropertyName(SYSPROP_FULLTEXT_BINARY, indexName); 242 for (DocumentRef docRef : docsToUpdate) { 243 session.setDocumentSystemProp(docRef, property, text); 244 } 245 } 246 } 247 248 protected String stringToText(String string) { 249 string = removeHtml(string); 250 string = removeEntities(string); 251 return string; 252 } 253 254 protected String removeHtml(String string) { 255 // quick HTML detection on the initial part of the string 256 String initial = string.substring(0, Math.min(string.length(), HTML_MAGIC_OFFSET)).toLowerCase(); 257 if (initial.startsWith("<!doctype html") || initial.contains("<html")) { 258 // convert using Jericho HTML Parser 259 string = new Source(string).getRenderer() 260 .setIncludeHyperlinkURLs(false) 261 .setDecorateFontStyles(false) 262 .toString(); 263 } 264 return string; 265 } 266 267 protected String removeEntities(String string) { 268 if (string.indexOf('&') >= 0) { 269 string = StringEscapeUtils.unescapeHtml4(string); 270 } 271 return string; 272 } 273 274 /** 275 * Converts the blob to text by calling a converter. 276 */ 277 protected String blobToText(Blob blob) { 278 try { 279 ConversionService conversionService = Framework.getService(ConversionService.class); 280 if (conversionService == null) { 281 log.debug("No ConversionService available"); 282 return ""; 283 } 284 BlobHolder blobHolder = conversionService.convert(ANY2TEXT_CONVERTER, new SimpleBlobHolder(blob), null); 285 if (blobHolder == null) { 286 return ""; 287 } 288 Blob resultBlob = blobHolder.getBlob(); 289 if (resultBlob == null) { 290 return ""; 291 } 292 String string = resultBlob.getString(); 293 // strip '\0 chars from text 294 if (string.indexOf('\0') >= 0) { 295 string = string.replace("\0", " "); 296 } 297 return string; 298 } catch (ConversionException | IOException e) { 299 String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId + ": " 300 + e; 301 log.warn(msg); 302 log.debug(msg, e); 303 return ""; 304 } 305 } 306 307 @SuppressWarnings("boxing") 308 protected String limitStringSize(String string, int maxSize) { 309 if (maxSize != 0 && string.length() > maxSize) { 310 if (log.isDebugEnabled()) { 311 log.debug(String.format("Fulltext extract of length: %s for document: %s truncated to length: %s", 312 string.length(), docId, maxSize)); 313 } 314 string = string.substring(0, maxSize); 315 } 316 return string; 317 } 318 319 protected String getFulltextPropertyName(String name, String indexName) { 320 if (!FULLTEXT_DEFAULT_INDEX.equals(indexName)) { 321 name += '_' + indexName; 322 } 323 return name; 324 } 325 326}