/*
 * (C) Copyright 2018 Nuxeo (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Florent Guillaume
 */
package org.nuxeo.ecm.core.utils;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.model.Property;
import org.nuxeo.ecm.core.api.model.impl.ArrayProperty;
import org.nuxeo.ecm.core.api.model.impl.ComplexProperty;
import org.nuxeo.ecm.core.api.model.impl.ListProperty;
import org.nuxeo.ecm.core.api.model.impl.primitives.StringProperty;

/**
 * Collects the string values held by the properties of a document.
 * <p>
 * Every extraction overwrites the instance fields below, so this class is not thread-safe.
 *
 * @since 10.3
 */
public class StringsExtractor {

    /** Document handed to the most recent extraction. */
    protected DocumentModel document;

    /** Paths whose strings are collected; {@code null} means every path is eligible. */
    protected Set<String> includedPaths;

    /** Paths whose strings are skipped; {@code null} means nothing is excluded. */
    protected Set<String> excludedPaths;

    /** Accumulator for the strings found during the current extraction. */
    protected List<String> strings;

    /**
     * Walks every property of every schema of the document and gathers the string values whose path passes the
     * include/exclude filter.
     * <p>
     * Filter paths are compared against schema-prefixed (normalized) paths: when a top-level property name carries no
     * {@code prefix:} part, the schema name is used as its prefix before the comparison.
     *
     * @param document the document
     * @param includedPaths the paths to include, or {@code null} for all paths
     * @param excludedPaths the paths to exclude, or {@code null} for none
     * @return the collected strings, none of which is {@code null}
     */
    public List<String> findStrings(DocumentModel document, Set<String> includedPaths, Set<String> excludedPaths) {
        this.document = document;
        this.includedPaths = includedPaths;
        this.excludedPaths = excludedPaths;
        this.strings = new ArrayList<>();
        for (String schemaName : document.getSchemas()) {
            for (Property topLevel : document.getPropertyObjects(schemaName)) {
                String name = topLevel.getField().getName().getPrefixedName();
                // unprefixed names get the schema name as prefix so that all paths share one format
                String path = name.contains(":") ? name : topLevel.getSchema().getName() + ":" + name;
                findStrings(topLevel, path);
            }
        }
        return strings;
    }

    /**
     * Tells whether strings found at the given path should be collected.
     *
     * @param path the schema-prefixed property path
     * @return {@code false} if the path is excluded; otherwise {@code true} when no inclusion set is configured or
     *         when the inclusion set contains the path
     */
    protected boolean isInterestingPath(String path) {
        boolean excluded = excludedPaths != null && excludedPaths.contains(path);
        return !excluded && (includedPaths == null || includedPaths.contains(path));
    }

    /**
     * Collects the strings of one property, recursing into complex and list properties.
     * <p>
     * Children of a complex property are visited under {@code path/childName}; items of a list property are all
     * visited under {@code path/*}.
     *
     * @param property the property to inspect
     * @param path the path of the property, used for include/exclude filtering
     */
    protected void findStrings(Property property, String path) {
        if (property instanceof StringProperty) {
            if (isInterestingPath(path)) {
                collectIfString(property.getValue());
            }
            return;
        }
        if (property instanceof ArrayProperty) {
            if (isInterestingPath(path)) {
                Serializable raw = property.getValue();
                if (raw instanceof Object[]) {
                    for (Object element : (Object[]) raw) {
                        collectIfString(element);
                    }
                }
            }
            return;
        }
        if (property instanceof ComplexProperty) {
            ComplexProperty complex = (ComplexProperty) property;
            for (Property child : complex.getChildren()) {
                String childName = child.getField().getName().getPrefixedName();
                findStrings(child, path + "/" + childName);
            }
            return;
        }
        if (property instanceof ListProperty) {
            // every list item shares the same wildcard path
            String itemPath = path + "/*";
            for (Property item : (ListProperty) property) {
                findStrings(item, itemPath);
            }
        }
    }

    /** Appends the candidate to the collected strings when it is a {@link String}; anything else is ignored. */
    private void collectIfString(Object candidate) {
        if (candidate instanceof String) {
            strings.add((String) candidate);
        }
    }
}