001/*
002 * (C) Copyright 2009 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Olivier Grisel
018 */
019package org.nuxeo.ecm.platform.categorization.categorizer;
020
021import java.util.Arrays;
022import java.util.HashMap;
023import java.util.List;
024import java.util.Map;
025
026import org.knallgrau.utils.textcat.TextCategorizer;
027import org.nuxeo.ecm.platform.categorization.service.Categorizer;
028
029/**
030 * Sample language guesser that straightforwardly use the pre-built models of the TextCat library.
031 */
032public class LanguageCategorizer implements Categorizer {
033
034    protected final TextCategorizer languageGuesser;
035
036    protected final static Map<String, String> languageNameToISO639Code = new HashMap<String, String>();
037
038    public LanguageCategorizer(String modelFile) {
039        if (modelFile != null) {
040            languageGuesser = new TextCategorizer(modelFile);
041        } else {
042            languageGuesser = new TextCategorizer();
043        }
044    }
045
046    public List<String> guessCategories(String textContent, int maxSuggestions) {
047        // only return one, whatever max suggestion is
048        return Arrays.asList(languageGuesser.categorize(textContent));
049    }
050
051    public List<String> guessCategories(String textContent, int maxSuggestions, Double precisionTreshold) {
052        // languageGuesser does not support setting a custom threshold
053        return guessCategories(textContent, maxSuggestions);
054    }
055
056}