001/* 002 * (C) Copyright 2009 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Olivier Grisel 018 */ 019package org.nuxeo.ecm.platform.categorization.categorizer; 020 021import java.util.Arrays; 022import java.util.HashMap; 023import java.util.List; 024import java.util.Map; 025 026import org.knallgrau.utils.textcat.TextCategorizer; 027import org.nuxeo.ecm.platform.categorization.service.Categorizer; 028 029/** 030 * Sample language guesser that straightforwardly use the pre-built models of the TextCat library. 031 */ 032public class LanguageCategorizer implements Categorizer { 033 034 protected final TextCategorizer languageGuesser; 035 036 protected final static Map<String, String> languageNameToISO639Code = new HashMap<String, String>(); 037 038 public LanguageCategorizer(String modelFile) { 039 if (modelFile != null) { 040 languageGuesser = new TextCategorizer(modelFile); 041 } else { 042 languageGuesser = new TextCategorizer(); 043 } 044 } 045 046 public List<String> guessCategories(String textContent, int maxSuggestions) { 047 // only return one, whatever max suggestion is 048 return Arrays.asList(languageGuesser.categorize(textContent)); 049 } 050 051 public List<String> guessCategories(String textContent, int maxSuggestions, Double precisionTreshold) { 052 // languageGuesser does not support setting a custom threshold 053 return guessCategories(textContent, maxSuggestions); 054 } 055 056}