001/* 002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Florent Guillaume 018 * Stephane Lacoin 019 */ 020package org.nuxeo.ecm.core.storage; 021 022import java.io.IOException; 023import java.util.LinkedList; 024import java.util.List; 025 026import org.apache.commons.lang.StringUtils; 027import org.apache.commons.logging.Log; 028import org.apache.commons.logging.LogFactory; 029import org.nuxeo.ecm.core.api.Blob; 030import org.nuxeo.ecm.core.api.DocumentLocation; 031import org.nuxeo.ecm.core.api.DocumentModel; 032import org.nuxeo.ecm.core.api.IdRef; 033import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 034import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder; 035import org.nuxeo.ecm.core.api.impl.DocumentLocationImpl; 036import org.nuxeo.ecm.core.api.impl.blob.StringBlob; 037import org.nuxeo.ecm.core.convert.api.ConversionException; 038import org.nuxeo.ecm.core.convert.api.ConversionService; 039import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText; 040import org.nuxeo.ecm.core.utils.BlobsExtractor; 041import org.nuxeo.ecm.core.work.AbstractWork; 042import org.nuxeo.ecm.core.work.api.Work; 043import org.nuxeo.ecm.core.work.api.WorkManager; 044import org.nuxeo.runtime.api.Framework; 045 046/** 047 * Work task that does fulltext extraction from the blobs of the given document. 048 * <p> 049 * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}. 050 * <p> 051 * This base abstract class must be subclassed in order to implement the proper 052 * {@link #initFulltextConfigurationAndParser} depending on the storage. 053 * 054 * @since 5.7 055 */ 056public abstract class FulltextExtractorWork extends AbstractWork { 057 058 private static final long serialVersionUID = 1L; 059 060 private static final Log log = LogFactory.getLog(FulltextExtractorWork.class); 061 062 protected static final String ANY2TEXT = "any2text"; 063 064 protected static final String CATEGORY = "fulltextExtractor"; 065 066 protected static final String TITLE = "fulltextExtractor"; 067 068 protected final boolean excludeProxies; 069 070 protected transient FulltextConfiguration fulltextConfiguration; 071 072 protected transient FulltextParser fulltextParser; 073 074 public FulltextExtractorWork(String repositoryName, String docId, boolean excludeProxies) { 075 setDocument(repositoryName, docId); 076 this.excludeProxies = excludeProxies; 077 } 078 079 @Override 080 public String getCategory() { 081 return CATEGORY; 082 } 083 084 @Override 085 public String getTitle() { 086 return TITLE; 087 } 088 089 @Override 090 public int getRetryCount() { 091 // even read-only threads may encounter concurrent update exceptions 092 // when trying to read a previously deleted complex property 093 // due to read committed semantics, cf NXP-17384 094 return 1; 095 } 096 097 @Override 098 public void work() { 099 openSystemSession(); 100 // if the runtime has shutdown (normally because tests are finished) 101 // this can happen, see NXP-4009 102 if (session.getPrincipal() == null) { 103 return; 104 } 105 106 initFulltextConfigurationAndParser(); 107 108 setStatus("Extracting"); 109 setProgress(Progress.PROGRESS_0_PC); 110 extractBinaryText(); 111 setProgress(Progress.PROGRESS_100_PC); 112 setStatus("Done"); 113 } 114 115 /** 116 * Initializes the fulltext configuration and parser. 117 * 118 * @since 5.9.5 119 */ 120 public abstract void initFulltextConfigurationAndParser(); 121 122 protected void extractBinaryText() { 123 IdRef docRef = new IdRef(docId); 124 if (!session.exists(docRef)) { 125 // doc is gone 126 return; 127 } 128 DocumentModel doc = session.getDocument(docRef); 129 if (excludeProxies && doc.isProxy()) { 130 // VCS proxies don't have any fulltext attached, it's 131 // the target document that carries it 132 return; 133 } 134 if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) { 135 // excluded by config 136 return; 137 } 138 139 // Iterate on each index to set the binaryText column 140 BlobsExtractor extractor = new BlobsExtractor(); 141 DocumentLocation docLocation = new DocumentLocationImpl(doc); 142 List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>(); 143 for (String indexName : fulltextConfiguration.indexNames) { 144 if (!fulltextConfiguration.indexesAllBinary.contains(indexName) 145 && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) { 146 // nothing to do: index not configured for blob 147 continue; 148 } 149 extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName), 150 fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), 151 fulltextConfiguration.indexesAllBinary.contains(indexName)); 152 List<Blob> blobs = extractor.getBlobs(doc); 153 StringBlob stringBlob = blobsToStringBlob(blobs, docId); 154 String text = fulltextParser.parse(stringBlob.getString(), null, stringBlob.getMimeType(), docLocation); 155 int fullTextFieldSizeLimit = fulltextConfiguration.fulltextFieldSizeLimit; 156 if (fullTextFieldSizeLimit != 0 && text.length() > fullTextFieldSizeLimit) { 157 if (log.isDebugEnabled()) { 158 log.debug(String.format( 159 "Fulltext extract of length: %s for indexName: %s of document: %s truncated to length: %s", 160 text.length(), indexName, docId, fullTextFieldSizeLimit)); 161 } 162 text = text.substring(0, fullTextFieldSizeLimit); 163 } 164 indexesAndText.add(new IndexAndText(indexName, text)); 165 } 166 if (!indexesAndText.isEmpty()) { 167 Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText); 168 if (!fulltextConfiguration.fulltextSearchDisabled) { 169 WorkManager workManager = Framework.getService(WorkManager.class); 170 workManager.schedule(work, true); 171 } else { 172 ((FulltextUpdaterWork)work).updateWithSession(session); 173 } 174 } 175 176 } 177 178 @Override 179 public void cleanUp(boolean ok, Exception e) { 180 super.cleanUp(ok, e); 181 fulltextConfiguration = null; 182 fulltextParser = null; 183 } 184 185 protected StringBlob blobsToStringBlob(List<Blob> blobs, String docId) { 186 String mimeType = null; 187 List<String> strings = new LinkedList<String>(); 188 for (Blob blob : blobs) { 189 try { 190 SimpleBlobHolder bh = new SimpleBlobHolder(blob); 191 BlobHolder result = convert(bh); 192 if (result == null) { 193 continue; 194 } 195 blob = result.getBlob(); 196 if (blob == null) { 197 continue; 198 } 199 if (StringUtils.isEmpty(mimeType) && StringUtils.isNotEmpty(blob.getMimeType())) { 200 mimeType = blob.getMimeType(); 201 } 202 String string = new String(blob.getByteArray(), "UTF-8"); 203 // strip '\0 chars from text 204 if (string.indexOf('\0') >= 0) { 205 string = string.replace("\0", " "); 206 } 207 strings.add(string); 208 } catch (ConversionException | IOException e) { 209 String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId 210 + ": " + e; 211 log.warn(msg); 212 log.debug(msg, e); 213 continue; 214 } 215 } 216 return new StringBlob(StringUtils.join(strings, " "), mimeType); 217 } 218 219 protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException { 220 ConversionService conversionService = Framework.getService(ConversionService.class); 221 if (conversionService == null) { 222 log.debug("No ConversionService available"); 223 return null; 224 } 225 return conversionService.convert(ANY2TEXT, blobHolder, null); 226 } 227 228}