/*
 * (C) Copyright 2006-2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Benjamin Jalon
 *     Florent Guillaume
 */
package org.nuxeo.ecm.core.utils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.model.Property;
import org.nuxeo.ecm.core.schema.DocumentType;
import org.nuxeo.ecm.core.schema.SchemaManager;
import org.nuxeo.ecm.core.schema.TypeConstants;
import org.nuxeo.ecm.core.schema.types.ComplexType;
import org.nuxeo.ecm.core.schema.types.Field;
import org.nuxeo.ecm.core.schema.types.ListType;
import org.nuxeo.ecm.core.schema.types.Schema;
import org.nuxeo.ecm.core.schema.types.Type;
import org.nuxeo.runtime.api.Framework;

/**
 * Extractor for all the blobs of a document.
 * <p>
 * The blob paths of each document type are computed once by {@link #getBlobPaths} and then kept in a per-instance
 * cache keyed by document type name.
 */
public class BlobsExtractor {

    // Separator between the segments of a blob path:
    // - "/[*]/" for a list (the wildcard segment is dropped)
    // - "/" for complex properties
    // Compiled once here rather than on every String#split call.
    private static final Pattern PATH_SEPARATOR = Pattern.compile("/[*]/|/");

    /** Cache of the blob paths found for each document type, keyed by document type name. */
    protected final Map<String, List<String>> docBlobPaths = new ConcurrentHashMap<>();

    /** Normalized paths explicitly included, or {@code null} if none were configured. */
    private Set<String> includedPaths;

    /** Normalized paths explicitly excluded, or {@code null} if none were configured. */
    private Set<String> excludedPaths;

    /** Whether paths that are neither included nor excluded are still considered interesting. */
    private boolean allBlobs;

    /** When {@code true}, no filtering is applied at all and every blob path is interesting. */
    private boolean isDefaultConfiguration = true;

    /**
     * Sets extractor properties, controlling what properties or values are returned by {@link #getBlobsProperties} or
     * {@link #getBlobs}.
     * <p>
     * The properties have to be defined without prefix if there is no prefix in the schema definition. For blob
     * properties, the path must include the {@code /data} part.
     *
     * @param includedPaths the paths to include, or {@code null}
     * @param excludedPaths the paths to exclude, or {@code null}; exclusion takes precedence over inclusion
     * @param allBlobs {@code true} if paths that are neither included nor excluded must be extracted as well
     */
    public void setExtractorProperties(Set<String> includedPaths, Set<String> excludedPaths, boolean allBlobs) {
        this.includedPaths = normalizePaths(includedPaths);
        this.excludedPaths = normalizePaths(excludedPaths);
        this.allBlobs = allBlobs;
        // only "no includes, no excludes, all blobs" is equivalent to not filtering anything
        isDefaultConfiguration = includedPaths == null && excludedPaths == null && allBlobs;
    }

    /**
     * Checks whether the blob at the given path must be extracted according to the extractor properties.
     *
     * @param path the blob path, in the form returned by {@link #getBlobPaths}
     * @return {@code true} if the path must be extracted
     */
    protected boolean isInterestingPath(String path) {
        if (isDefaultConfiguration) {
            return true;
        }
        // exclusion is checked first, so it wins over an inclusion of the same path
        if (excludedPaths != null && excludedPaths.contains(path)) {
            return false;
        }
        if (includedPaths != null && includedPaths.contains(path)) {
            return true;
        }
        // neither included nor excluded
        return allBlobs;
    }

    /**
     * Removes the "/data" suffix used by FulltextConfiguration.
     * <p>
     * Adds missing schema name as prefix if no prefix ("content" -> "file:content").
     *
     * @param paths the paths to normalize, or {@code null}
     * @return a new set holding the normalized paths, or {@code null} if {@code paths} is {@code null}
     */
    protected Set<String> normalizePaths(Set<String> paths) {
        if (paths == null) {
            return null;
        }
        SchemaManager schemaManager = Framework.getService(SchemaManager.class);
        Set<String> normPaths = new HashSet<>();
        for (String path : paths) {
            // remove "/data" suffix
            if (path.endsWith("/data")) {
                path = path.substring(0, path.length() - "/data".length());
            }
            // add schema if no schema prefix
            if (schemaManager.getField(path) == null && !path.contains(":")) {
                // check without prefix
                // TODO precompute this in SchemaManagerImpl
                int slash = path.indexOf('/');
                String first = slash == -1 ? path : path.substring(0, slash);
                for (Schema schema : schemaManager.getSchemas()) {
                    // only a schema without prefix can own an unprefixed field; the first match is used
                    if (!schema.getNamespace().hasPrefix() && schema.getField(first) != null) {
                        path = schema.getName() + ":" + path;
                        break;
                    }
                }
            }
            normPaths.add(path);
        }
        return normPaths;
    }

    /**
     * Gets the blobs of the document.
     *
     * @param doc the document
     * @return the list of blobs
     */
    public List<Blob> getBlobs(DocumentModel doc) {
        List<Property> properties = getBlobsProperties(doc);
        List<Blob> blobs = new ArrayList<>(properties.size());
        for (Property property : properties) {
            blobs.add((Blob) property.getValue());
        }
        return blobs;
    }

    /**
     * Gets the blob properties of the document.
     * <p>
     * Only the paths accepted by {@link #isInterestingPath} are considered, and only the properties whose value is
     * not {@code null} are returned.
     *
     * @param doc the document
     * @return the list of blob properties
     * @throws IllegalStateException if a blob path yields no segment at all once split
     */
    public List<Property> getBlobsProperties(DocumentModel doc) {
        List<Property> properties = new ArrayList<>();
        for (String path : getBlobPaths(doc.getDocumentType())) {
            if (!isInterestingPath(path)) {
                continue;
            }
            List<String> split = Arrays.asList(PATH_SEPARATOR.split(path));
            if (split.isEmpty()) {
                throw new IllegalStateException("Path detected not well-formed: " + path);
            }
            // the first segment is the top-level property, the remaining ones are walked recursively
            Property property = doc.getProperty(split.get(0));
            List<String> subPath = split.subList(1, split.size());
            findBlobsProperties(property, subPath, properties);
        }
        return properties;
    }

    /**
     * Gets the blob paths of the document type. Extractor properties are ignored.
     * <p>
     * The result is computed on first use for a given document type name and cached; the returned list is the cached
     * instance, so callers should not modify it.
     *
     * @param documentType the document type
     * @return the list of blob paths
     * @since 8.3
     */
    public List<String> getBlobPaths(DocumentType documentType) {
        String docType = documentType.getName();
        List<String> paths = docBlobPaths.get(docType);
        if (paths == null) {
            paths = new ArrayList<>();
            for (Schema schema : documentType.getSchemas()) {
                findBlobPaths(schema, null, schema, paths);
            }
            docBlobPaths.put(docType, paths);
        }
        return paths;
    }

    /**
     * Recursively follows the remaining path segments from the given property and collects the properties reached
     * at the end of the path whose value is not {@code null}.
     *
     * @param property the property to start from
     * @param split the remaining path segments below {@code property}, empty when {@code property} is the target
     * @param properties the list to which the found properties are added
     */
    protected void findBlobsProperties(Property property, List<String> split, List<Property> properties) {
        if (split.isEmpty()) {
            // end of the path: keep the property only if it actually holds a value
            if (property.getValue() != null) {
                properties.add(property);
            }
            return;
        }
        String name = split.get(0);
        List<String> subPath = split.subList(1, split.size());
        if (property.isList()) {
            // the segment is resolved against each element of the list
            for (Property childProperty : property.getChildren()) {
                Property childSubProp = childProperty.get(name);
                findBlobsProperties(childSubProp, subPath, properties);
            }
        } else { // complex type
            Property childSubProp = property.get(name);
            findBlobsProperties(childSubProp, subPath, properties);
        }
    }

    /**
     * Recursively walks the fields of a complex type and records the path of every field whose type is a content
     * type according to {@link TypeConstants#isContentType}.
     * <p>
     * Simple fields and lists of non-complex types are skipped. For a list of complex types, the element type is
     * walked with {@code /*} appended to the path.
     *
     * @param complexType the complex type whose fields are walked
     * @param path the path of {@code complexType}, or {@code null} when walking the schema itself
     * @param schema the schema being walked, used to prefix top-level fields
     * @param paths the list to which the found blob paths are added
     */
    protected void findBlobPaths(ComplexType complexType, String path, Schema schema, List<String> paths) {
        for (Field field : complexType.getFields()) {
            String fieldPath = field.getName().getPrefixedName();
            if (path == null) {
                // add schema name as prefix if the schema doesn't have a prefix
                if (!schema.getNamespace().hasPrefix()) {
                    fieldPath = schema.getName() + ":" + fieldPath;
                }
            } else {
                fieldPath = path + "/" + fieldPath;
            }
            Type type = field.getType();
            if (type.isSimpleType()) {
                continue; // not binary text
            }
            if (type.isListType()) {
                Type fieldType = ((ListType) type).getFieldType();
                if (fieldType.isComplexType()) {
                    findBlobPaths((ComplexType) fieldType, fieldPath + "/*", schema, paths);
                }
                // a list of non-complex types is not binary text
                continue;
            }
            // complex type
            ComplexType ctype = (ComplexType) type;
            if (TypeConstants.isContentType(type)) {
                // note this path
                paths.add(fieldPath);
            } else {
                findBlobPaths(ctype, fieldPath, schema, paths);
            }
        }
    }
}