001/*
002 * (C) Copyright 2006-2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Benjamin Jalon
018 *     Florent Guillaume
019 */
020package org.nuxeo.ecm.core.utils;
021
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Map;
027import java.util.Set;
028import java.util.concurrent.ConcurrentHashMap;
029
030import org.nuxeo.ecm.core.api.Blob;
031import org.nuxeo.ecm.core.api.DocumentModel;
032import org.nuxeo.ecm.core.api.model.Property;
033import org.nuxeo.ecm.core.schema.DocumentType;
034import org.nuxeo.ecm.core.schema.SchemaManager;
035import org.nuxeo.ecm.core.schema.TypeConstants;
036import org.nuxeo.ecm.core.schema.types.ComplexType;
037import org.nuxeo.ecm.core.schema.types.Field;
038import org.nuxeo.ecm.core.schema.types.ListType;
039import org.nuxeo.ecm.core.schema.types.Schema;
040import org.nuxeo.ecm.core.schema.types.Type;
041import org.nuxeo.runtime.api.Framework;
042
043/**
044 * Extractor for all the blobs of a document.
045 */
046public class BlobsExtractor {
047
048    protected final Map<String, List<String>> docBlobPaths = new ConcurrentHashMap<>();
049
050    private Set<String> includedPaths;
051
052    private Set<String> excludedPaths;
053
054    private boolean allBlobs;
055
056    private boolean isDefaultConfiguration = true;
057
058    /**
059     * Sets extractor properties, controlling what properties or values are returned by {@link #getBlobsProperties} or
060     * {@link #getBlobs}.
061     * <p>
062     * The properties have to be defined without prefix if there is no prefix in the schema definition. For blob
063     * properties, the path must include the {@code /data} part.
064     */
065    public void setExtractorProperties(Set<String> includedPaths, Set<String> excludedPaths, boolean allBlobs) {
066        this.includedPaths = normalizePaths(includedPaths);
067        this.excludedPaths = normalizePaths(excludedPaths);
068        this.allBlobs = allBlobs;
069        isDefaultConfiguration = includedPaths == null && excludedPaths == null && allBlobs;
070    }
071
072    protected boolean isInterestingPath(String path) {
073        if (isDefaultConfiguration) {
074            return true;
075        } else if (excludedPaths != null && excludedPaths.contains(path)) {
076            return false;
077        } else if (includedPaths != null && includedPaths.contains(path)) {
078            return true;
079        } else if (allBlobs) {
080            return true;
081        }
082        return false;
083    }
084
085    /**
086     * Removes the "/data" suffix used by FulltextConfiguration.
087     * <p>
088     * Adds missing schema name as prefix if no prefix ("content" -&gt; "file:content").
089     */
090    protected Set<String> normalizePaths(Set<String> paths) {
091        if (paths == null) {
092            return null;
093        }
094        SchemaManager schemaManager = Framework.getService(SchemaManager.class);
095        Set<String> normPaths = new HashSet<>();
096        for (String path : paths) {
097            // remove "/data" suffix
098            if (path.endsWith("/data")) {
099                path = path.substring(0, path.length() - "/data".length());
100            }
101            // add schema if no schema prefix
102            if (schemaManager.getField(path) == null && !path.contains(":")) {
103                // check without prefix
104                // TODO precompute this in SchemaManagerImpl
105                int slash = path.indexOf('/');
106                String first = slash == -1 ? path : path.substring(0, slash);
107                for (Schema schema : schemaManager.getSchemas()) {
108                    if (!schema.getNamespace().hasPrefix()) {
109                        // schema without prefix, try it
110                        if (schema.getField(first) != null) {
111                            path = schema.getName() + ":" + path;
112                            break;
113                        }
114                    }
115                }
116            }
117            normPaths.add(path);
118        }
119        return normPaths;
120    }
121
122    /**
123     * Gets the blobs of the document.
124     *
125     * @param doc the document
126     * @return the list of blobs
127     */
128    public List<Blob> getBlobs(DocumentModel doc) {
129        List<Blob> blobs = new ArrayList<>();
130        for (Property property : getBlobsProperties(doc)) {
131            blobs.add((Blob) property.getValue());
132        }
133        return blobs;
134    }
135
136    /**
137     * Gets the blob properties of the document.
138     *
139     * @param doc the document
140     * @return the list of blob properties
141     */
142    public List<Property> getBlobsProperties(DocumentModel doc) {
143        List<Property> properties = new ArrayList<>();
144        for (String path : getBlobPaths(doc.getDocumentType())) {
145            if (!isInterestingPath(path)) {
146                continue;
147            }
148            // split on:
149            // - "[*]" for list
150            // - "/" for complex properties
151            List<String> split = Arrays.asList(path.split("/[*]/|/"));
152            if (split.isEmpty()) {
153                throw new IllegalStateException("Path detected not well-formed: " + path);
154            }
155            Property property = doc.getProperty(split.get(0));
156            List<String> subPath = split.subList(1, split.size());
157            findBlobsProperties(property, subPath, properties);
158        }
159        return properties;
160    }
161
162    /**
163     * Gets the blob paths of the document type. Extractor properties are ignored.
164     *
165     * @param documentType the document type
166     * @return the list of blob paths
167     * @since 8.3
168     */
169    public List<String> getBlobPaths(DocumentType documentType) {
170        String docType = documentType.getName();
171        List<String> paths = docBlobPaths.get(docType);
172        if (paths == null) {
173            paths = new ArrayList<>();
174            for (Schema schema : documentType.getSchemas()) {
175                findBlobPaths(schema, null, schema, paths);
176            }
177            docBlobPaths.put(docType, paths);
178        }
179        return paths;
180    }
181
182    protected void findBlobsProperties(Property property, List<String> split, List<Property> properties) {
183        if (split.isEmpty()) {
184            if (property.getValue() != null) {
185                properties.add(property);
186            }
187        } else {
188            String name = split.get(0);
189            List<String> subPath = split.subList(1, split.size());
190            if (property.isList()) {
191                for (Property childProperty : property.getChildren()) {
192                    Property childSubProp = childProperty.get(name);
193                    findBlobsProperties(childSubProp, subPath, properties);
194                }
195            } else { // complex type
196                Property childSubProp = property.get(name);
197                findBlobsProperties(childSubProp, subPath, properties);
198            }
199        }
200    }
201
202    protected void findBlobPaths(ComplexType complexType, String path, Schema schema, List<String> paths) {
203        for (Field field : complexType.getFields()) {
204            String fieldPath = field.getName().getPrefixedName();
205            if (path == null) {
206                // add schema name as prefix if the schema doesn't have a prefix
207                if (!schema.getNamespace().hasPrefix()) {
208                    fieldPath = schema.getName() + ":" + fieldPath;
209                }
210            } else {
211                fieldPath = path + "/" + fieldPath;
212            }
213            Type type = field.getType();
214            if (type.isSimpleType()) {
215                continue; // not binary text
216            } else if (type.isListType()) {
217                Type fieldType = ((ListType) type).getFieldType();
218                if (fieldType.isComplexType()) {
219                    findBlobPaths((ComplexType) fieldType, fieldPath + "/*", schema, paths);
220                } else {
221                    continue; // not binary text
222                }
223            } else { // complex type
224                ComplexType ctype = (ComplexType) type;
225                if (TypeConstants.isContentType(type)) {
226                    // note this path
227                    paths.add(fieldPath);
228                } else {
229                    findBlobPaths(ctype, fieldPath, schema, paths);
230                }
231            }
232        }
233    }
234}