001/* 002 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Florent Guillaume 018 * Stephane Lacoin 019 */ 020package org.nuxeo.ecm.core.storage; 021 022import java.io.IOException; 023import java.util.LinkedList; 024import java.util.List; 025 026import org.apache.commons.lang.StringUtils; 027import org.apache.commons.logging.Log; 028import org.apache.commons.logging.LogFactory; 029import org.nuxeo.ecm.core.api.Blob; 030import org.nuxeo.ecm.core.api.DocumentLocation; 031import org.nuxeo.ecm.core.api.DocumentModel; 032import org.nuxeo.ecm.core.api.IdRef; 033import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 034import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder; 035import org.nuxeo.ecm.core.api.impl.DocumentLocationImpl; 036import org.nuxeo.ecm.core.api.impl.blob.StringBlob; 037import org.nuxeo.ecm.core.convert.api.ConversionException; 038import org.nuxeo.ecm.core.convert.api.ConversionService; 039import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText; 040import org.nuxeo.ecm.core.utils.BlobsExtractor; 041import org.nuxeo.ecm.core.work.AbstractWork; 042import org.nuxeo.ecm.core.work.api.Work; 043import org.nuxeo.ecm.core.work.api.WorkManager; 044import org.nuxeo.runtime.api.Framework; 045 046/** 047 * Work task that does fulltext extraction from the blobs of the given document. 048 * <p> 049 * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}. 050 * <p> 051 * This base abstract class must be subclassed in order to implement the proper 052 * {@link #initFulltextConfigurationAndParser} depending on the storage. 053 * 054 * @since 5.7 055 */ 056public abstract class FulltextExtractorWork extends AbstractWork { 057 058 private static final long serialVersionUID = 1L; 059 060 private static final Log log = LogFactory.getLog(FulltextExtractorWork.class); 061 062 protected static final String ANY2TEXT = "any2text"; 063 064 protected static final String CATEGORY = "fulltextExtractor"; 065 066 protected static final String TITLE = "fulltextExtractor"; 067 068 protected final boolean excludeProxies; 069 070 protected transient FulltextConfiguration fulltextConfiguration; 071 072 protected transient FulltextParser fulltextParser; 073 074 public FulltextExtractorWork(String repositoryName, String docId, String id, boolean excludeProxies) { 075 super(id); 076 setDocument(repositoryName, docId); 077 this.excludeProxies = excludeProxies; 078 } 079 080 @Override 081 public String getCategory() { 082 return CATEGORY; 083 } 084 085 @Override 086 public String getTitle() { 087 return TITLE; 088 } 089 090 @Override 091 public int getRetryCount() { 092 // even read-only threads may encounter concurrent update exceptions 093 // when trying to read a previously deleted complex property 094 // due to read committed semantics, cf NXP-17384 095 return 1; 096 } 097 098 @Override 099 public void work() { 100 openSystemSession(); 101 // if the runtime has shutdown (normally because tests are finished) 102 // this can happen, see NXP-4009 103 if (session.getPrincipal() == null) { 104 return; 105 } 106 107 initFulltextConfigurationAndParser(); 108 109 setStatus("Extracting"); 110 setProgress(Progress.PROGRESS_0_PC); 111 extractBinaryText(); 112 setProgress(Progress.PROGRESS_100_PC); 113 setStatus("Done"); 114 } 115 116 /** 117 * Initializes the fulltext configuration and parser. 118 * 119 * @since 5.9.5 120 */ 121 public abstract void initFulltextConfigurationAndParser(); 122 123 protected void extractBinaryText() { 124 IdRef docRef = new IdRef(docId); 125 if (!session.exists(docRef)) { 126 // doc is gone 127 return; 128 } 129 DocumentModel doc = session.getDocument(docRef); 130 if (excludeProxies && doc.isProxy()) { 131 // VCS proxies don't have any fulltext attached, it's 132 // the target document that carries it 133 return; 134 } 135 if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) { 136 // excluded by config 137 return; 138 } 139 140 // Iterate on each index to set the binaryText column 141 BlobsExtractor extractor = new BlobsExtractor(); 142 DocumentLocation docLocation = new DocumentLocationImpl(doc); 143 List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>(); 144 for (String indexName : fulltextConfiguration.indexNames) { 145 if (!fulltextConfiguration.indexesAllBinary.contains(indexName) 146 && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) { 147 // nothing to do: index not configured for blob 148 continue; 149 } 150 extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName), 151 fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), 152 fulltextConfiguration.indexesAllBinary.contains(indexName)); 153 List<Blob> blobs = extractor.getBlobs(doc); 154 StringBlob stringBlob = blobsToStringBlob(blobs, docId); 155 String text = fulltextParser.parse(stringBlob.getString(), null, stringBlob.getMimeType(), docLocation); 156 int fullTextFieldSizeLimit = fulltextConfiguration.fulltextFieldSizeLimit; 157 if (fullTextFieldSizeLimit != 0 && text.length() > fullTextFieldSizeLimit) { 158 if (log.isDebugEnabled()) { 159 log.debug(String.format( 160 "Fulltext extract of length: %s for indexName: %s of document: %s truncated to length: %s", 161 text.length(), indexName, docId, fullTextFieldSizeLimit)); 162 } 163 text = text.substring(0, fullTextFieldSizeLimit); 164 } 165 indexesAndText.add(new IndexAndText(indexName, text)); 166 } 167 if (!indexesAndText.isEmpty()) { 168 Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText); 169 if (!fulltextConfiguration.fulltextSearchDisabled) { 170 WorkManager workManager = Framework.getLocalService(WorkManager.class); 171 workManager.schedule(work, true); 172 } else { 173 ((FulltextUpdaterWork)work).updateWithSession(session); 174 } 175 } 176 177 } 178 179 @Override 180 public void cleanUp(boolean ok, Exception e) { 181 super.cleanUp(ok, e); 182 fulltextConfiguration = null; 183 fulltextParser = null; 184 } 185 186 protected StringBlob blobsToStringBlob(List<Blob> blobs, String docId) { 187 String mimeType = null; 188 List<String> strings = new LinkedList<String>(); 189 for (Blob blob : blobs) { 190 try { 191 SimpleBlobHolder bh = new SimpleBlobHolder(blob); 192 BlobHolder result = convert(bh); 193 if (result == null) { 194 continue; 195 } 196 blob = result.getBlob(); 197 if (blob == null) { 198 continue; 199 } 200 if (StringUtils.isEmpty(mimeType) && StringUtils.isNotEmpty(blob.getMimeType())) { 201 mimeType = blob.getMimeType(); 202 } 203 String string = new String(blob.getByteArray(), "UTF-8"); 204 // strip '\0 chars from text 205 if (string.indexOf('\0') >= 0) { 206 string = string.replace("\0", " "); 207 } 208 strings.add(string); 209 } catch (ConversionException | IOException e) { 210 String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId 211 + ": " + e; 212 log.warn(msg); 213 log.debug(msg, e); 214 continue; 215 } 216 } 217 return new StringBlob(StringUtils.join(strings, " "), mimeType); 218 } 219 220 protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException { 221 ConversionService conversionService = Framework.getLocalService(ConversionService.class); 222 if (conversionService == null) { 223 log.debug("No ConversionService available"); 224 return null; 225 } 226 return conversionService.convert(ANY2TEXT, blobHolder, null); 227 } 228 229}