/*
 * (C) Copyright 2009 Nuxeo SAS (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Olivier Grisel
 */
package org.nuxeo.ecm.platform.categorization.categorizer.tfidf;

import java.io.Serializable;
import java.util.List;

/**
 * Hashed vector representation of the token unigrams and bigrams of a document provided as a sequence of tokens.
 * <p>
 * We use a hash representation to be able to maintain low memory requirements by avoiding to store an explicit map
 * from string bigrams to feature vector index in memory.
 * <p>
 * http://hunch.net/~jl/projects/hash_reps/index.html http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
 *
 * @author ogrisel
 */
public class HashingVectorizer implements Serializable {

    private static final long serialVersionUID = 1L;

    // Number of hash buckets, i.e. the dimensionality of the output vectors. 2 ** 19 by default.
    protected int dim = 524288; // 2 ** 19

    // Number of distinct hash functions folded into each ngram count (counting Bloom filter style).
    protected int probes = 2;

    // Maximum token distance for pair features; 0 disables pair features entirely.
    protected int window = 0;

    /**
     * Chain configuration of the number of buckets, which is also the number of the vectors dimensions, small values
     * mean high probabilities of collisions.
     *
     * @param dim the number of buckets, must be strictly positive
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code dim} is not strictly positive (a non-positive dimension would
     *             otherwise only fail later, deep inside {@link #count(List)} or {@link #hash(String, int)})
     */
    public HashingVectorizer dimension(int dim) {
        if (dim < 1) {
            throw new IllegalArgumentException("dimension must be strictly positive, got: " + dim);
        }
        this.dim = dim;
        return this;
    }

    /**
     * Chain configuration of the token pair window: window = 0 means unigrams only, window = 1 adds bigrams of
     * adjacent tokens, window = 2 also adds token pairs of distance 2, and so on.
     *
     * @param window the maximum distance between paired tokens, must be non-negative
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code window} is negative
     */
    public HashingVectorizer window(int window) {
        if (window < 0) {
            throw new IllegalArgumentException("window must be non-negative, got: " + window);
        }
        this.window = window;
        return this;
    }

    /**
     * Chain configuration of the number of probes, i.e. number of distinct hash functions to use for each ngram count.
     *
     * @param probes the number of hash functions per ngram, must be non-negative
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code probes} is negative
     */
    public HashingVectorizer probes(int probes) {
        if (probes < 0) {
            throw new IllegalArgumentException("probes must be non-negative, got: " + probes);
        }
        this.probes = probes;
        return this;
    }

    /**
     * Builds a dense count vector of the hashed token features of the document.
     *
     * @param tokens the document as a pre-tokenized sequence of tokens
     * @return a new dense array of {@link #dim} hashed feature counts
     */
    // TODO: implement a sparse equivalent using an long[] for the indices of
    // non-zero values and a second count[] for the counts them-selves
    public long[] count(List<String> tokens) {
        long[] counts = new long[dim];
        addCounts(tokens, counts);
        return counts;
    }

    /**
     * Accumulates the hashed token (and token pair, if {@code window > 0}) counts of the document into an existing
     * counts array, so that several documents can be folded into a single vector.
     *
     * @param tokens the document as a pre-tokenized sequence of tokens
     * @param counts the accumulator array, expected to be of length {@link #dim}
     */
    public void addCounts(List<String> tokens, long[] counts) {
        int n = 0; // index of the current token in the sequence
        for (String token : tokens) {
            // unigram features: one bucket increment per probe
            for (int probe = 0; probe < probes; probe++) {
                counts[hash(token, probe)]++;
            }
            if (window > 0) {
                // pair features: combine the current token with each previous token within the window
                for (int j = Math.max(0, n - window); j < n; j++) {
                    for (int probe = 0; probe < probes; probe++) {
                        counts[hash(token, tokens.get(j), probe)]++;
                    }
                }
            }
            n++;
        }
    }

    /**
     * Hashes a single token (unigram) to a bucket index for the given probe.
     *
     * @param token the token to hash
     * @param probe the index of the hash function to use
     * @return a bucket index in {@code [0, dim)}
     */
    protected int hash(String token, int probe) {
        return hash(token, null, probe);
    }

    /**
     * Hashes a token pair (or a single token when {@code prevToken} is null) to a bucket index for the given probe.
     * The probe index is folded into the hashed string so each probe behaves as a distinct hash function.
     *
     * @param token the current token
     * @param prevToken the earlier token of the pair, or null for a unigram feature
     * @param probe the index of the hash function to use
     * @return a bucket index in {@code [0, dim)}
     */
    protected int hash(String token, String prevToken, int probe) {
        int h = (token + " " + prevToken + " " + probe).hashCode() % dim;
        if (h < 0) {
            // String.hashCode may be negative; shift into [0, dim)
            h += dim;
        }
        return h;
    }

}