/*
 * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Florent Guillaume
 *     Stephane Lacoin
 */
package org.nuxeo.ecm.core.storage;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.IdRef;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.ecm.core.work.AbstractWork;
import org.nuxeo.ecm.core.work.api.Work;
import org.nuxeo.ecm.core.work.api.WorkManager;
import org.nuxeo.runtime.api.Framework;

/**
 * Work task that does fulltext extraction from the blobs of the given document.
 * <p>
 * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}.
 * <p>
 * This base abstract class must be subclassed in order to implement the proper
 * {@link #initFulltextConfigurationAndParser} depending on the storage.
 *
 * @since 5.7
 */
public abstract class FulltextExtractorWork extends AbstractWork {

    private static final long serialVersionUID = 1L;

    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);

    /** Name of the converter used to turn a blob into text. */
    protected static final String ANY2TEXT = "any2text";

    protected static final String CATEGORY = "fulltextExtractor";

    protected static final String TITLE = "fulltextExtractor";

    /** When {@code true}, proxy documents are skipped by {@link #extractBinaryText()}. */
    protected final boolean excludeProxies;

    // transient: set by initFulltextConfigurationAndParser() and reset in cleanUp()
    protected transient FulltextConfiguration fulltextConfiguration;

    protected transient FulltextParser fulltextParser;

    /**
     * Creates the work for a given document.
     *
     * @param repositoryName the repository name of the document
     * @param docId the id of the document whose blobs are extracted
     * @param id the work id, passed to the superclass constructor
     * @param excludeProxies whether proxy documents must be skipped
     */
    public FulltextExtractorWork(String repositoryName, String docId, String id, boolean excludeProxies) {
        super(id);
        setDocument(repositoryName, docId);
        this.excludeProxies = excludeProxies;
    }

    @Override
    public String getCategory() {
        return CATEGORY;
    }

    @Override
    public String getTitle() {
        return TITLE;
    }

    @Override
    public int getRetryCount() {
        // even read-only threads may encounter concurrent update exceptions
        // when trying to read a previously deleted complex property
        // due to read committed semantics, cf NXP-17384
        return 1;
    }

    /**
     * Opens a system session, initializes the fulltext configuration and parser, then runs the extraction while
     * updating the work status and progress.
     */
    @Override
    public void work() {
        openSystemSession();
        // if the runtime has shutdown (normally because tests are finished)
        // this can happen, see NXP-4009
        if (session.getPrincipal() == null) {
            return;
        }

        initFulltextConfigurationAndParser();

        setStatus("Extracting");
        setProgress(Progress.PROGRESS_0_PC);
        extractBinaryText();
        setProgress(Progress.PROGRESS_100_PC);
        setStatus("Done");
    }

    /**
     * Initializes the fulltext configuration and parser.
     *
     * @since 5.9.5
     */
    public abstract void initFulltextConfigurationAndParser();

    /**
     * Extracts the text of the document's blobs for every index configured for binaries and, if at least one index
     * was processed, schedules a {@link FulltextUpdaterWork} carrying the extracted text.
     * <p>
     * Nothing is done when the document no longer exists, when it is a proxy and proxies are excluded, or when its
     * type is not fulltext-indexable according to the configuration.
     */
    protected void extractBinaryText() {
        IdRef docRef = new IdRef(docId);
        if (!session.exists(docRef)) {
            // doc is gone
            return;
        }
        DocumentModel doc = session.getDocument(docRef);
        if (excludeProxies && doc.isProxy()) {
            // VCS proxies don't have any fulltext attached, it's
            // the target document that carries it
            return;
        }
        if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) {
            // excluded by config
            return;
        }

        // Iterate on each index to set the binaryText column
        BlobsExtractor extractor = new BlobsExtractor();
        List<IndexAndText> indexesAndText = new LinkedList<>();
        for (String indexName : fulltextConfiguration.indexNames) {
            // looked up once: used both for the skip test and for the extractor setup
            boolean allBinary = fulltextConfiguration.indexesAllBinary.contains(indexName);
            if (!allBinary && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) {
                // nothing to do: index not configured for blob
                continue;
            }
            extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName),
                    fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), allBinary);
            List<Blob> blobs = extractor.getBlobs(doc);
            String text = blobsToText(blobs, docId);
            text = fulltextParser.parse(text, null);
            indexesAndText.add(new IndexAndText(indexName, text));
        }
        if (!indexesAndText.isEmpty()) {
            Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText);
            WorkManager workManager = Framework.getLocalService(WorkManager.class);
            workManager.schedule(work, true);
        }
    }

    /**
     * Delegates to the superclass cleanup, then drops the references to the fulltext configuration and parser.
     */
    @Override
    public void cleanUp(boolean ok, Exception e) {
        super.cleanUp(ok, e);
        fulltextConfiguration = null;
        fulltextParser = null;
    }

    /**
     * Converts each blob to text and joins the results with a single space.
     * <p>
     * Blobs for which the conversion yields no result or no blob are skipped. A conversion or read failure on one
     * blob is logged and does not prevent the other blobs from being processed.
     *
     * @param blobs the blobs to convert
     * @param docId the id of the owning document, used only in log messages
     * @return the space-joined text of all successfully converted blobs
     */
    protected String blobsToText(List<Blob> blobs, String docId) {
        List<String> strings = new LinkedList<>();
        for (Blob blob : blobs) {
            try {
                SimpleBlobHolder bh = new SimpleBlobHolder(blob);
                BlobHolder result = convert(bh);
                if (result == null) {
                    continue;
                }
                // keep the converted blob in its own variable so that the catch block
                // below still reports the filename of the source blob
                Blob textBlob = result.getBlob();
                if (textBlob == null) {
                    continue;
                }
                String string = new String(textBlob.getByteArray(), "UTF-8");
                // replace '\0' chars in the text with spaces
                if (string.indexOf('\0') >= 0) {
                    string = string.replace("\0", " ");
                }
                strings.add(string);
            } catch (ConversionException | IOException e) {
                String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId
                        + ": " + e;
                // short message at WARN level, full stack trace only at DEBUG level
                log.warn(msg);
                log.debug(msg, e);
                continue;
            }
        }
        return StringUtils.join(strings, " ");
    }

    /**
     * Converts the given blob holder with the {@value #ANY2TEXT} converter.
     *
     * @param blobHolder the blob holder to convert
     * @return the conversion result, or {@code null} if no {@link ConversionService} is available
     * @throws ConversionException if the conversion fails
     */
    protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException {
        ConversionService conversionService = Framework.getLocalService(ConversionService.class);
        if (conversionService == null) {
            log.debug("No ConversionService available");
            return null;
        }
        return conversionService.convert(ANY2TEXT, blobHolder, null);
    }

}