001/*
002 * (C) Copyright 2012-2014 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 */
019package org.nuxeo.ecm.core.storage;
020
021import org.nuxeo.ecm.core.api.DocumentLocation;
022
023import java.util.List;
024
025/**
026 * Parser of strings for fulltext indexing.
027 * <p>
028 * From the strings extracted from the document, decides how they should be parsed, split and normalized for fulltext
029 * indexing by the underlying engine.
030 *
031 * @since 5.9.5
032 */
033public interface FulltextParser {
034
035    /**
036     * Parses one property value to normalize the fulltext for the database.
037     * <p>
038     * The passed {@code path} may be {@code null} if the passed string is not coming from a specific path, for instance
039     * when it was extracted from binary data.
040     *
041     * @param s the string to be parsed and normalized
042     * @param path the abstracted path for the property (where all complex indexes have been replaced by {@code *}), or
043     *            {@code null}
044     * @return the normalized words as a single space-separated string
045     */
046    String parse(String s, String path);
047
048    /**
049     * Parses one property value to normalize the fulltext for the database.
050     * <p>
051     * Like {@link #parse(String, String)} but uses the passed list to accumulate words.
052     *
053     * @param s the string to be parsed and normalized
054     * @param path the abstracted path for the property (where all complex indexes have been replaced by {@code *}), or
055     *            {@code null}
056     * @param strings the list into which normalized words should be accumulated
057     */
058    void parse(String s, String path, List<String> strings);
059
060    /**
061     * Parses one property value to normalize the fulltext for the database.
062     * <p>
063     * The passed {@code path} may be {@code null} if the passed string is not coming from a specific path, for instance
064     * when it was extracted from binary data.
065     *
066     * @param s the string to be parsed and normalized
067     * @param path the abstracted path for the property (where all complex indexes have been replaced by {@code *}), or
068     *            {@code null}
069     * @param mimeType the {@code mimeType} of the string to be parsed and normalized. This may be {@code null}
070     * @param documentLocation the {@code documentLocation} of the Document from which the property value string
071     *            was extracted. This may be {@code null}
072     * @return the normalized words as a single space-separated string
073     * @since 8.4
074     */
075    String parse(String s, String path, String mimeType, DocumentLocation documentLocation);
076
077    /**
078     * Parses one property value to normalize the fulltext for the database.
079     * <p>
080     * Like {@link #parse(String, String)} but uses the passed list to accumulate words.
081     *
082     * @param s the string to be parsed and normalized
083     * @param path the abstracted path for the property (where all complex indexes have been replaced by {@code *}), or
084     *            {@code null}
085     * @param mimeType the {@code mimeType} of the string to be parsed and normalized. This may be {@code null}
086     * @param documentLocation the {@code documentLocation} of the Document from which the property value string
087     *            was extracted. This may be {@code null}
088     * @param strings the list into which normalized words should be accumulated
089     * @since 8.4
090     */
091    void parse(String s, String path, String mimeType, DocumentLocation documentLocation, List<String> strings);
092
093}