001/*
002 * (C) Copyright 2006-2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Benjamin Jalon
018 *     Florent Guillaume
019 */
020package org.nuxeo.ecm.core.utils;
021
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Map;
027import java.util.Set;
028import java.util.concurrent.ConcurrentHashMap;
029
030import org.nuxeo.ecm.core.api.Blob;
031import org.nuxeo.ecm.core.api.DocumentModel;
032import org.nuxeo.ecm.core.api.model.Property;
033import org.nuxeo.ecm.core.schema.DocumentType;
034import org.nuxeo.ecm.core.schema.SchemaManager;
035import org.nuxeo.ecm.core.schema.TypeConstants;
036import org.nuxeo.ecm.core.schema.types.ComplexType;
037import org.nuxeo.ecm.core.schema.types.Field;
038import org.nuxeo.ecm.core.schema.types.ListType;
039import org.nuxeo.ecm.core.schema.types.Schema;
040import org.nuxeo.ecm.core.schema.types.Type;
041import org.nuxeo.runtime.api.Framework;
042
043/**
044 * Extractor for all the blobs of a document.
045 */
046public class BlobsExtractor {
047
048    protected final Map<String, List<String>> docBlobPaths = new ConcurrentHashMap<>();
049
050    private Set<String> includedPaths;
051
052    private Set<String> excludedPaths;
053
054    private boolean allBlobs;
055
056    private boolean isDefaultConfiguration = true;
057
058    /**
059     * Sets extractor properties, controlling what properties or values are returned by {@link #getBlobsProperties} or
060     * {@link #getBlobs}.
061     * <p>
062     * The properties have to be defined without prefix if there is no prefix in the schema definition. For blob
063     * properties, the path must include the {@code /data} part.
064     */
065    public void setExtractorProperties(Set<String> includedPaths, Set<String> excludedPaths, boolean allBlobs) {
066        this.includedPaths = normalizePaths(includedPaths);
067        this.excludedPaths = normalizePaths(excludedPaths);
068        this.allBlobs = allBlobs;
069        isDefaultConfiguration = includedPaths == null && excludedPaths == null && allBlobs;
070    }
071
072    protected boolean isInterestingPath(String path) {
073        if (isDefaultConfiguration) {
074            return true;
075        } else if (excludedPaths != null && excludedPaths.contains(path)) {
076            return false;
077        } else if (includedPaths != null && includedPaths.contains(path)) {
078            return true;
079        } else if (allBlobs) {
080            return true;
081        }
082        return false;
083    }
084
085    /**
086     * Removes the "/data" suffix used by FulltextConfiguration.
087     * <p>
088     * Adds missing schema name as prefix if no prefix ("content" -> "file:content").
089     */
090    protected Set<String> normalizePaths(Set<String> paths) {
091        if (paths == null) {
092            return null;
093        }
094        SchemaManager schemaManager = Framework.getService(SchemaManager.class);
095        Set<String> normPaths = new HashSet<>();
096        for (String path : paths) {
097            // remove "/data" suffix
098            if (path.endsWith("/data")) {
099                path = path.substring(0, path.length() - "/data".length());
100            }
101            // add schema if no schema prefix
102            if (schemaManager.getField(path) == null && !path.contains(":")) {
103                // check without prefix
104                // TODO precompute this in SchemaManagerImpl
105                int slash = path.indexOf('/');
106                String first = slash == -1 ? path : path.substring(0, slash);
107                for (Schema schema : schemaManager.getSchemas()) {
108                    if (!schema.getNamespace().hasPrefix()) {
109                        // schema without prefix, try it
110                        if (schema.getField(first) != null) {
111                            path = schema.getName() + ":" + path;
112                            break;
113                        }
114                    }
115                }
116            }
117            normPaths.add(path);
118        }
119        return normPaths;
120    }
121
122    /**
123     * Gets the blobs of the document.
124     *
125     * @param doc the document
126     * @return the list of blobs
127     */
128    public List<Blob> getBlobs(DocumentModel doc) {
129        List<Blob> blobs = new ArrayList<>();
130        for (Property property : getBlobsProperties(doc)) {
131            blobs.add((Blob) property.getValue());
132        }
133        return blobs;
134    }
135
136    /**
137     * Gets the blob properties of the document.
138     *
139     * @param doc the document
140     * @return the list of blob properties
141     */
142    public List<Property> getBlobsProperties(DocumentModel doc) {
143        List<Property> properties = new ArrayList<>();
144        for (String path : getBlobPaths(doc.getDocumentType())) {
145            if (!isInterestingPath(path)) {
146                continue;
147            }
148            List<String> split = Arrays.asList(path.split("/[*]/"));
149            if (split.isEmpty()) {
150                throw new IllegalStateException("Path detected not well-formed: " + path);
151            }
152            Property property = doc.getProperty(split.get(0));
153            List<String> subPath = split.subList(1, split.size());
154            findBlobsProperties(property, subPath, properties);
155        }
156        return properties;
157    }
158
159    /**
160     * Gets the blob paths of the document type. Extractor properties are ignored.
161     *
162     * @param documentType the document type
163     * @return the list of blob paths
164     *
165     * @since 8.3
166     */
167    public List<String> getBlobPaths(DocumentType documentType) {
168        String docType = documentType.getName();
169        List<String> paths = docBlobPaths.get(docType);
170        if (paths == null) {
171            paths = new ArrayList<>();
172            for (Schema schema : documentType.getSchemas()) {
173                findBlobPaths(schema, null, schema, paths);
174            }
175            docBlobPaths.put(docType, paths);
176        }
177        return paths;
178    }
179
180    protected void findBlobsProperties(Property property, List<String> split, List<Property> properties) {
181        if (split.isEmpty()) {
182            if (property.getValue() != null) {
183                properties.add(property);
184            }
185        } else {
186            for (Property childProperty : property.getChildren()) {
187                Property childSubProp = childProperty.get(split.get(0));
188                List<String> subPath = split.subList(1, split.size());
189                findBlobsProperties(childSubProp, subPath, properties);
190            }
191        }
192    }
193
194    protected void findBlobPaths(ComplexType complexType, String path, Schema schema, List<String> paths) {
195        for (Field field : complexType.getFields()) {
196            String fieldPath = field.getName().getPrefixedName();
197            if (path == null) {
198                // add schema name as prefix if the schema doesn't have a prefix
199                if (!schema.getNamespace().hasPrefix()) {
200                    fieldPath = schema.getName() + ":" + fieldPath;
201                }
202            } else {
203                fieldPath = path + "/" + fieldPath;
204            }
205            Type type = field.getType();
206            if (type.isSimpleType()) {
207                continue; // not binary text
208            } else if (type.isListType()) {
209                Type fieldType = ((ListType) type).getFieldType();
210                if (fieldType.isComplexType()) {
211                    findBlobPaths((ComplexType) fieldType, fieldPath + "/*", schema, paths);
212                } else {
213                    continue; // not binary text
214                }
215            } else { // complex type
216                ComplexType ctype = (ComplexType) type;
217                if (TypeConstants.isContentType(type)) {
218                    // note this path
219                    paths.add(fieldPath);
220                } else {
221                    findBlobPaths(ctype, fieldPath, schema, paths);
222                }
223            }
224        }
225    }
226}