001/*
002 * (C) Copyright 2018 Nuxeo (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 */
019package org.nuxeo.ecm.core.utils;
020
021import java.io.Serializable;
022import java.util.ArrayList;
023import java.util.List;
024import java.util.Set;
025
026import org.nuxeo.ecm.core.api.DocumentModel;
027import org.nuxeo.ecm.core.api.model.Property;
028import org.nuxeo.ecm.core.api.model.impl.ArrayProperty;
029import org.nuxeo.ecm.core.api.model.impl.ComplexProperty;
030import org.nuxeo.ecm.core.api.model.impl.ListProperty;
031import org.nuxeo.ecm.core.api.model.impl.primitives.StringProperty;
032
033/**
034 * Finds the strings in a document (string properties).
035 * <p>
036 * This class is not thread-safe.
037 *
038 * @since 10.3
039 */
040public class StringsExtractor {
041
042    protected DocumentModel document;
043
044    // paths for which we extract fulltext, or null for all
045    protected Set<String> includedPaths;
046
047    protected Set<String> excludedPaths;
048
049    // collected strings
050    protected List<String> strings;
051
052    /**
053     * Finds strings from the document for a given set of included and excluded paths.
054     * <p>
055     * Paths must be specified with a schema prefix in all cases (normalized).
056     *
057     * @param document the document
058     * @param includedPaths the paths to include, or {@code null} for all paths
059     * @param excludedPaths the paths to exclude, or {@code null} for none
060     * @return a list of strings (each string is never {@code null})
061     */
062    public List<String> findStrings(DocumentModel document, Set<String> includedPaths, Set<String> excludedPaths) {
063        this.document = document;
064        this.includedPaths = includedPaths;
065        this.excludedPaths = excludedPaths;
066        strings = new ArrayList<>();
067        for (String schema : document.getSchemas()) {
068            for (Property property : document.getPropertyObjects(schema)) {
069                String path = property.getField().getName().getPrefixedName();
070                if (!path.contains(":")) {
071                    // add schema name as prefix if the schema doesn't have a prefix
072                    path = property.getSchema().getName() + ":" + path;
073                }
074                findStrings(property, path);
075            }
076        }
077        return strings;
078    }
079
080    protected boolean isInterestingPath(String path) {
081        if (excludedPaths != null && excludedPaths.contains(path)) {
082            return false;
083        }
084        return includedPaths == null || includedPaths.contains(path);
085    }
086
087    protected void findStrings(Property property, String path) {
088        if (property instanceof StringProperty) {
089            if (isInterestingPath(path)) {
090                Serializable value = property.getValue();
091                if (value instanceof String) {
092                    strings.add((String) value);
093                }
094            }
095        } else if (property instanceof ArrayProperty) {
096            if (isInterestingPath(path)) {
097                Serializable value = property.getValue();
098                if (value instanceof Object[]) {
099                    for (Object v : (Object[]) value) {
100                        if (v instanceof String) {
101                            strings.add((String) v);
102                        }
103                    }
104                }
105            }
106        } else if (property instanceof ComplexProperty) {
107            for (Property p : ((ComplexProperty) property).getChildren()) {
108                String pp = p.getField().getName().getPrefixedName();
109                findStrings(p, path + '/' + pp);
110            }
111        } else if (property instanceof ListProperty) {
112            for (Property p : (ListProperty) property) {
113                findStrings(p, path + "/*");
114            }
115        }
116    }
117}