001/* 002 * (C) Copyright 2009 Nuxeo SAS (http://nuxeo.com/) and contributors. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the GNU Lesser General Public License 006 * (LGPL) version 2.1 which accompanies this distribution, and is available at 007 * http://www.gnu.org/licenses/lgpl.html 008 * 009 * This library is distributed in the hope that it will be useful, 010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 012 * Lesser General Public License for more details. 013 * 014 * Contributors: 015 * Olivier Grisel 016 */ 017package org.nuxeo.ecm.platform.categorization.categorizer; 018 019import java.util.Arrays; 020import java.util.HashMap; 021import java.util.List; 022import java.util.Map; 023 024import org.knallgrau.utils.textcat.TextCategorizer; 025import org.nuxeo.ecm.platform.categorization.service.Categorizer; 026 027/** 028 * Sample language guesser that straightforwardly use the pre-built models of the TextCat library. 029 */ 030public class LanguageCategorizer implements Categorizer { 031 032 protected final TextCategorizer languageGuesser; 033 034 protected final static Map<String, String> languageNameToISO639Code = new HashMap<String, String>(); 035 036 public LanguageCategorizer(String modelFile) { 037 if (modelFile != null) { 038 languageGuesser = new TextCategorizer(modelFile); 039 } else { 040 languageGuesser = new TextCategorizer(); 041 } 042 } 043 044 public List<String> guessCategories(String textContent, int maxSuggestions) { 045 // only return one, whatever max suggestion is 046 return Arrays.asList(languageGuesser.categorize(textContent)); 047 } 048 049 public List<String> guessCategories(String textContent, int maxSuggestions, Double precisionTreshold) { 050 // languageGuesser does not support setting a custom threshold 051 return guessCategories(textContent, maxSuggestions); 052 } 053 054}