/*
 * (C) Copyright 2009 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Olivier Grisel
 */
package org.nuxeo.ecm.platform.categorization.categorizer.tfidf;

import java.io.Serializable;
import java.util.List;

/**
 * Hashed vector representation of the token unigrams and bigrams of a document provided as a sequence of tokens.
 * <p>
 * We use a hash representations to be able to maintain low memory requirements by avoiding to store an explicit map
 * from string bigrams to feature vector index in memory.
 * <p>
 * http://hunch.net/~jl/projects/hash_reps/index.html http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
 *
 * @author ogrisel
 */
public class HashingVectorizer implements Serializable {

    private static final long serialVersionUID = 1L;

    // Number of hash buckets, which is also the dimension of the produced count vectors. Default: 2 ** 19.
    protected int dim = 524288; // 2 ** 19

    // Number of distinct hash functions folded into each ngram count (counting Bloom filter style, to
    // soften the impact of individual collisions).
    protected int probes = 2;

    // Maximum token-pair distance hashed in addition to unigrams; 0 disables pair (ngram) features entirely.
    protected int window = 0;

    /**
     * Chain configuration of the number of buckets, which is also the number of the vectors dimensions, small values
     * mean high probabilities of collisions.
     *
     * @param dim the number of hash buckets (vector dimensions), must be strictly positive
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code dim} is zero or negative (would otherwise surface later as a
     *             {@link NegativeArraySizeException} or empty vectors in {@link #count})
     */
    public HashingVectorizer dimension(int dim) {
        if (dim <= 0) {
            throw new IllegalArgumentException("dim must be strictly positive: " + dim);
        }
        this.dim = dim;
        return this;
    }

    /**
     * Chain configuration of the token window size: {@code window = 0} hashes unigrams only, {@code window = 1} adds
     * adjacent token pairs (bigrams), {@code window = 2} also adds pairs of tokens at distance 2, and so on.
     *
     * @param window the maximum pair distance to hash, must not be negative
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code window} is negative
     */
    public HashingVectorizer window(int window) {
        if (window < 0) {
            throw new IllegalArgumentException("window must not be negative: " + window);
        }
        this.window = window;
        return this;
    }

    /**
     * Chain configuration of the number of probes, i.e. number of distinct hash functions to use for each ngram count.
     *
     * @param probes the number of hash functions per ngram, must be strictly positive
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code probes} is zero or negative (would otherwise silently yield all-zero
     *             vectors)
     */
    public HashingVectorizer probes(int probes) {
        if (probes <= 0) {
            throw new IllegalArgumentException("probes must be strictly positive: " + probes);
        }
        this.probes = probes;
        return this;
    }

    /**
     * Computes the dense hashed count vector of the given token sequence.
     *
     * @param tokens the document as a sequence of tokens
     * @return a new array of {@code dim} counts
     */
    // TODO: implement a sparse equivalent using an long[] for the indices of
    // non-zero values and a second count[] for the counts them-selves
    public long[] count(List<String> tokens) {
        long[] counts = new long[dim];
        addCounts(tokens, counts);
        return counts;
    }

    /**
     * Accumulates the hashed counts of the given token sequence into an existing vector, allowing several documents
     * (or document chunks) to be folded into the same counts.
     *
     * @param tokens the document as a sequence of tokens
     * @param counts the accumulator, expected to be of length {@code dim}
     */
    public void addCounts(List<String> tokens, long[] counts) {
        int n = 0;
        for (String token : tokens) {
            // unigram features: one bucket increment per probe
            for (int probe = 0; probe < probes; probe++) {
                counts[hash(token, probe)]++;
            }
            if (window > 0) {
                // pair features: hash the current token with each previous token within the window
                for (int j = Math.max(0, n - window); j < n; j++) {
                    for (int probe = 0; probe < probes; probe++) {
                        counts[hash(token, tokens.get(j), probe)]++;
                    }
                }
            }
            n++;
        }
    }

    /**
     * Hashes a single token (unigram) for the given probe.
     */
    protected int hash(String token, int probe) {
        return hash(token, null, probe);
    }

    /**
     * Hashes a token pair for the given probe into a bucket index in {@code [0, dim)}.
     * <p>
     * Note: a null {@code prevToken} (unigram case) is deliberately folded in as the literal string "null" by string
     * concatenation, keeping unigram and pair feature spaces distinct.
     */
    protected int hash(String token, String prevToken, int probe) {
        int h = (token + " " + prevToken + " " + probe).hashCode() % dim;
        if (h < 0) {
            // Java's % can return a negative remainder; shift into [0, dim)
            h += dim;
        }
        return h;
    }

}