/*
 * Copyright (c) 2006-2013 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Florent Guillaume
 *     Stephane Lacoin
 */
package org.nuxeo.ecm.core.storage;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.IdRef;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.ecm.core.work.AbstractWork;
import org.nuxeo.ecm.core.work.api.Work;
import org.nuxeo.ecm.core.work.api.WorkManager;
import org.nuxeo.runtime.api.Framework;

/**
 * Work task that does fulltext extraction from the blobs of the given document.
 * <p>
 * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}.
 * <p>
 * This base abstract class must be subclassed in order to implement the proper
 * {@link #initFulltextConfigurationAndParser} depending on the storage.
 *
 * @since 5.7
 */
public abstract class FulltextExtractorWork extends AbstractWork {

    private static final long serialVersionUID = 1L;

    private static final Log log = LogFactory.getLog(FulltextExtractorWork.class);

    /** Name of the converter used to turn a blob into plain text. */
    protected static final String ANY2TEXT = "any2text";

    protected static final String CATEGORY = "fulltextExtractor";

    protected static final String TITLE = "fulltextExtractor";

    /** When {@code true}, documents that are proxies are skipped. */
    protected final boolean excludeProxies;

    /** Set by {@link #initFulltextConfigurationAndParser()}, cleared in {@link #cleanUp}. */
    protected transient FulltextConfiguration fulltextConfiguration;

    /** Set by {@link #initFulltextConfigurationAndParser()}, cleared in {@link #cleanUp}. */
    protected transient FulltextParser fulltextParser;

    /**
     * Creates the work for the given document.
     *
     * @param repositoryName the repository holding the document
     * @param docId the id of the document whose blobs are to be extracted
     * @param id the work id, passed to the superclass
     * @param excludeProxies whether proxy documents must be skipped
     */
    public FulltextExtractorWork(String repositoryName, String docId, String id, boolean excludeProxies) {
        super(id);
        setDocument(repositoryName, docId);
        this.excludeProxies = excludeProxies;
    }

    @Override
    public String getCategory() {
        return CATEGORY;
    }

    @Override
    public String getTitle() {
        return TITLE;
    }

    @Override
    public int getRetryCount() {
        // even read-only threads may encounter concurrent update exceptions
        // when trying to read a previously deleted complex property
        // due to read committed semantics, cf NXP-17384
        return 1;
    }

    /**
     * Opens the session, initializes the fulltext configuration and parser, then runs the extraction while
     * reporting status and progress.
     */
    @Override
    public void work() {
        initSession();
        // if the runtime has shutdown (normally because tests are finished)
        // this can happen, see NXP-4009
        if (session.getPrincipal() == null) {
            return;
        }

        initFulltextConfigurationAndParser();

        setStatus("Extracting");
        setProgress(Progress.PROGRESS_0_PC);
        extractBinaryText();
        setProgress(Progress.PROGRESS_100_PC);
        setStatus("Done");
    }

    /**
     * Initializes the fulltext configuration and parser.
     *
     * @since 5.9.5
     */
    public abstract void initFulltextConfigurationAndParser();

    /**
     * Extracts the text of the document's blobs for every index configured for binaries, and schedules a
     * {@link FulltextUpdaterWork} carrying the per-index text.
     * <p>
     * Nothing is done when the document no longer exists, is an excluded proxy, or has a type that is not
     * fulltext-indexable according to the configuration.
     */
    protected void extractBinaryText() {
        IdRef docRef = new IdRef(docId);
        if (!session.exists(docRef)) {
            // doc is gone
            return;
        }
        DocumentModel doc = session.getDocument(docRef);
        if (excludeProxies && doc.isProxy()) {
            // VCS proxies don't have any fulltext attached, it's
            // the target document that carries it
            return;
        }
        if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) {
            // excluded by config
            return;
        }

        // Iterate on each index to set the binaryText column
        BlobsExtractor extractor = new BlobsExtractor();
        List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>();
        for (String indexName : fulltextConfiguration.indexNames) {
            boolean allBinary = fulltextConfiguration.indexesAllBinary.contains(indexName);
            if (!allBinary && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) {
                // nothing to do: index not configured for blob
                continue;
            }
            extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName),
                    fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), allBinary);
            List<Blob> blobs = extractor.getBlobs(doc);
            String text = blobsToText(blobs, docId);
            text = fulltextParser.parse(text, null);
            indexesAndText.add(new IndexAndText(indexName, text));
        }
        if (!indexesAndText.isEmpty()) {
            Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText);
            WorkManager workManager = Framework.getLocalService(WorkManager.class);
            workManager.schedule(work, true);
        }
    }

    /**
     * Runs the superclass cleanup, then drops the references to the configuration and parser.
     */
    @Override
    public void cleanUp(boolean ok, Exception e) {
        super.cleanUp(ok, e);
        fulltextConfiguration = null;
        fulltextParser = null;
    }

    /**
     * Converts each blob to text and joins the results with a single space.
     * <p>
     * A blob whose conversion yields no result or no blob is skipped. A blob whose conversion or read fails is
     * logged (warning without stack trace, debug with it) and skipped, so that one bad file does not prevent
     * the others from being extracted.
     *
     * @param blobs the blobs to convert
     * @param docId the document id, used only in log messages
     * @return the space-joined text of all blobs that could be converted
     */
    protected String blobsToText(List<Blob> blobs, String docId) {
        List<String> strings = new LinkedList<String>();
        for (Blob blob : blobs) {
            try {
                BlobHolder result = convert(new SimpleBlobHolder(blob));
                if (result == null) {
                    continue;
                }
                // keep the source blob untouched so that a failure below is
                // reported with the filename of the file being extracted
                Blob converted = result.getBlob();
                if (converted == null) {
                    continue;
                }
                String string = new String(converted.getByteArray(), StandardCharsets.UTF_8);
                // replace '\0' chars in the text by spaces
                string = string.replace('\0', ' ');
                strings.add(string);
            } catch (ConversionException | IOException e) {
                String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId
                        + ": " + e;
                log.warn(msg);
                log.debug(msg, e);
            }
        }
        return StringUtils.join(strings, " ");
    }

    /**
     * Converts the given blob holder with the {@value #ANY2TEXT} converter.
     *
     * @param blobHolder the holder of the blob to convert
     * @return the conversion result, or {@code null} when no {@link ConversionService} is available
     * @throws ConversionException if the conversion service fails
     */
    protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException {
        ConversionService conversionService = Framework.getLocalService(ConversionService.class);
        if (conversionService == null) {
            log.debug("No ConversionService available");
            return null;
        }
        return conversionService.convert(ANY2TEXT, blobHolder, null);
    }

}