001/*
002 * (C) Copyright 2009 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Olivier Grisel
016 */
017package org.nuxeo.ecm.platform.categorization.categorizer;
018
019import java.util.Arrays;
020import java.util.HashMap;
021import java.util.List;
022import java.util.Map;
023
024import org.knallgrau.utils.textcat.TextCategorizer;
025import org.nuxeo.ecm.platform.categorization.service.Categorizer;
026
027/**
028 * Sample language guesser that straightforwardly use the pre-built models of the TextCat library.
029 */
030public class LanguageCategorizer implements Categorizer {
031
032    protected final TextCategorizer languageGuesser;
033
034    protected final static Map<String, String> languageNameToISO639Code = new HashMap<String, String>();
035
036    public LanguageCategorizer(String modelFile) {
037        if (modelFile != null) {
038            languageGuesser = new TextCategorizer(modelFile);
039        } else {
040            languageGuesser = new TextCategorizer();
041        }
042    }
043
044    public List<String> guessCategories(String textContent, int maxSuggestions) {
045        // only return one, whatever max suggestion is
046        return Arrays.asList(languageGuesser.categorize(textContent));
047    }
048
049    public List<String> guessCategories(String textContent, int maxSuggestions, Double precisionTreshold) {
050        // languageGuesser does not support setting a custom threshold
051        return guessCategories(textContent, maxSuggestions);
052    }
053
054}