001/*
002 * (C) Copyright 2006-2011 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 */
019package org.nuxeo.ecm.core.utils;
020
021import java.util.ArrayList;
022import java.util.Arrays;
023import java.util.HashMap;
024import java.util.List;
025import java.util.Map;
026import java.util.Set;
027
028import org.apache.commons.logging.Log;
029import org.apache.commons.logging.LogFactory;
030import org.nuxeo.ecm.core.api.Blob;
031import org.nuxeo.ecm.core.api.DocumentModel;
032import org.nuxeo.ecm.core.api.model.Property;
033import org.nuxeo.ecm.core.schema.DocumentType;
034import org.nuxeo.ecm.core.schema.SchemaManager;
035import org.nuxeo.ecm.core.schema.TypeConstants;
036import org.nuxeo.ecm.core.schema.types.ComplexType;
037import org.nuxeo.ecm.core.schema.types.Field;
038import org.nuxeo.ecm.core.schema.types.ListType;
039import org.nuxeo.ecm.core.schema.types.Schema;
040import org.nuxeo.ecm.core.schema.types.Type;
041import org.nuxeo.runtime.api.Framework;
042
043/**
044 * Extractor for all the blobs of a document.
045 *
046 * @author Florent Guillaume
047 * @author Benjamin Jalon
048 */
049public class BlobsExtractor {
050
051    protected static final Log log = LogFactory.getLog(BlobsExtractor.class);
052
053    protected final Map<String, Map<String, List<String>>> blobFieldPaths = new HashMap<String, Map<String, List<String>>>();
054
055    protected List<String> docTypeCached = new ArrayList<String>();
056
057    protected SchemaManager schemaManager;
058
059    private Set<String> pathProperties;
060
061    private Set<String> excludedPathProperties;
062
063    private boolean indexAllBinary = false;
064
065    private boolean isDefaultConfiguration = true;
066
067    protected SchemaManager getSchemaManager() {
068        if (schemaManager == null) {
069            schemaManager = Framework.getService(SchemaManager.class);
070        }
071        return schemaManager;
072    }
073
074    /**
075     * Get properties of the given document that contain a blob value. This method uses the cache engine to find these
076     * properties.
077     */
078    public List<Property> getBlobsProperties(DocumentModel doc) {
079
080        List<Property> result = new ArrayList<Property>();
081        for (String schema : getBlobFieldPathForDocumentType(doc.getType()).keySet()) {
082            List<String> pathsList = getBlobFieldPathForDocumentType(doc.getType()).get(schema);
083            for (String path : pathsList) {
084                if (!isInterestingBlobProperty(path, schemaManager.getSchema(schema).getNamespace().prefix)) {
085                    continue;
086                }
087                List<String> pathSplitted = Arrays.asList(path.split("/[*]/"));
088                if (pathSplitted.size() == 0) {
089                    throw new IllegalStateException("Path detected not wellformed: " + pathsList);
090                }
091                Property prop = doc.getProperty(schema + ":" + pathSplitted.get(0));
092
093                if (pathSplitted.size() >= 1) {
094                    List<String> subPath = pathSplitted.subList(1, pathSplitted.size());
095                    getBlobValue(prop, subPath, path, result);
096                }
097            }
098        }
099
100        return result;
101    }
102
103    /**
104     * Get path list of properties that may contain a blob for the given document type.
105     *
106     * @param documentType document type name
107     * @return return the property names that contain blob
108     */
109    public Map<String, List<String>> getBlobFieldPathForDocumentType(String documentType) {
110        DocumentType docType = getSchemaManager().getDocumentType(documentType);
111
112        if (!docTypeCached.contains(documentType)) {
113            Map<String, List<String>> paths = new HashMap<String, List<String>>();
114            blobFieldPaths.put(docType.getName(), paths);
115
116            createCacheForDocumentType(docType);
117        }
118
119        return blobFieldPaths.get(documentType);
120    }
121
122    public void invalidateDocumentTypeCache(String docType) {
123        if (docTypeCached.contains(docType)) {
124            docTypeCached.remove(docType);
125        }
126    }
127
128    public void invalidateCache() {
129        docTypeCached = new ArrayList<String>();
130    }
131
132    protected void createCacheForDocumentType(DocumentType docType) {
133
134        for (Schema schema : docType.getSchemas()) {
135            findInteresting(docType, schema, "", schema);
136        }
137
138        if (!docTypeCached.contains(docType.getName())) {
139            docTypeCached.add(docType.getName());
140        }
141    }
142
143    /**
144     * Analyzes the document's schemas to find which fields and complex types contain blobs. For each blob fields type
145     * found, {@link BlobsExtractor#blobMatched(DocumentType, Schema, String, Field)} is called and for each property
146     * that contains a subProperty containing a Blob,
147     * {@link BlobsExtractor#containsBlob(DocumentType, Schema, String, Field)} is called
148     *
149     * @param schema The parent schema that contains the field
150     * @param ct Current type parsed
151     * @return {@code true} if the passed complex type contains at least one blob field
152     */
153    protected boolean findInteresting(DocumentType docType, Schema schema, String path, ComplexType ct) {
154        boolean interesting = false;
155        for (Field field : ct.getFields()) {
156            Type type = field.getType();
157            if (type.isSimpleType()) {
158                continue; // not binary text
159            } else if (type.isListType()) {
160                Type ftype = ((ListType) type).getField().getType();
161                if (ftype.isComplexType()) {
162                    String blobMatchedPath = path + String.format("/%s/*", field.getName().getLocalName());
163                    if (findInteresting(docType, schema, blobMatchedPath, (ComplexType) ftype)) {
164                        containsBlob(docType, schema, blobMatchedPath, field);
165                        interesting |= true;
166                    }
167                } else {
168                    continue; // not binary text
169                }
170            } else { // complex type
171                ComplexType ctype = (ComplexType) type;
172                if (type.getName().equals(TypeConstants.CONTENT)) {
173                    // CB: Fix for NXP-3847 - do not accumulate field name in
174                    // the path
175                    String blobMatchedPath = path + String.format("/%s", field.getName().getLocalName());
176                    blobMatched(docType, schema, blobMatchedPath, field);
177                    interesting = true;
178                } else {
179                    String blobMatchedPath = path + String.format("/%s", field.getName().getLocalName());
180                    interesting |= findInteresting(docType, schema, blobMatchedPath, ctype);
181                }
182            }
183        }
184        if (interesting) {
185            containsBlob(docType, schema, path, null);
186        }
187        return interesting;
188    }
189
190    /**
191     * Call during the parsing of the schema structure in {@link BlobsExtractor#findInteresting} if field is a Blob
192     * Type. This method stores the path to that Field.
193     *
194     * @param schema The parent schema that contains the field
195     * @param field Field that is a BlobType
196     */
197    protected void blobMatched(DocumentType docType, Schema schema, String path, Field field) {
198        Map<String, List<String>> blobPathsForDocType = blobFieldPaths.get(docType.getName());
199        List<String> pathsList = blobPathsForDocType.get(schema.getName());
200        if (pathsList == null) {
201            pathsList = new ArrayList<String>();
202            blobPathsForDocType.put(schema.getName(), pathsList);
203            blobFieldPaths.put(docType.getName(), blobPathsForDocType);
204        }
205        pathsList.add(path);
206    }
207
208    /**
209     * Called during the parsing of the schema structure in {@link BlobsExtractor#findInteresting} if field contains a
210     * subfield of type Blob. This method does nothing.
211     *
212     * @param schema The parent schema that contains the field
213     * @param field Field that contains a subField of type BlobType
214     */
215    protected void containsBlob(DocumentType docType, Schema schema, String path, Field field) {
216    }
217
218    protected void getBlobValue(Property prop, List<String> subPath, String completePath, List<Property> result) {
219        if (subPath.size() == 0) {
220            if (!(prop.getValue() instanceof Blob)) {
221                log.debug("Path Field not contains a blob value: " + completePath);
222                return;
223            }
224            result.add(prop);
225            return;
226        }
227
228        for (Property childProp : prop.getChildren()) {
229            if ("/*".equals(subPath.get(0))) {
230                log.debug("TODO : BLOB IN A LIST NOT IMPLEMENTED for this path " + completePath);
231            }
232            Property childSubProp = childProp.get(subPath.get(0));
233            getBlobValue(childSubProp, subPath.subList(1, subPath.size()), completePath, result);
234        }
235    }
236
237    /**
238     * Finds all the blobs of the document.
239     * <p>
240     * This method is not thread-safe.
241     *
242     * @param doc the document
243     * @return the list of blobs in the document
244     */
245    public List<Blob> getBlobs(DocumentModel doc) {
246        List<Blob> result = new ArrayList<Blob>();
247        for (Property blobField : getBlobsProperties(doc)) {
248            Blob blob = (Blob) blobField.getValue();
249            result.add(blob);
250        }
251        return result;
252    }
253
254    public void setExtractorProperties(Set<String> pathProps, Set<String> excludedPathProps, boolean indexBlobs) {
255        pathProperties = pathProps;
256        excludedPathProperties = excludedPathProps;
257        indexAllBinary = indexBlobs;
258        isDefaultConfiguration = (pathProps == null && excludedPathProps == null && Boolean.TRUE.equals(indexBlobs));
259    }
260
261    private boolean isInterestingBlobProperty(String path, String prefix) {
262        if (isDefaultConfiguration) {
263            return true;
264        } else if (pathProperties != null && matchProperty(prefix, path, pathProperties)) {
265            return true;
266        } else if (excludedPathProperties != null && matchProperty(prefix, path, excludedPathProperties)) {
267            return false;
268        } else if (Boolean.TRUE.equals(indexAllBinary)) {
269            return true;
270        }
271        return false;
272    }
273
274    private boolean matchProperty(String prefix, String fieldPath, Set<String> propPaths) {
275        if (!prefix.equals("")) {
276            prefix += ":";
277        }
278        String pathToMatch = prefix + fieldPath.substring(1);
279        for (String propPath : propPaths) {
280            if (propPath.startsWith(pathToMatch)) {
281                return true;
282            }
283        }
284        return false;
285    }
286}