-
Notifications
You must be signed in to change notification settings - Fork 26
Expand file tree
/
Copy pathterm_document_matrix.rb
More file actions
38 lines (30 loc) · 959 Bytes
/
term_document_matrix.rb
File metadata and controls
38 lines (30 loc) · 959 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
require 'gsl'
# Term-by-document matrix of TF-IDF weights stored in a GSL::Matrix.
#
# Row i corresponds to the i-th entry of +corpus.terms+ and column j to the
# j-th element of +corpus.documents+. Each cell holds
# +document.term_frequency(term) * corpus.inverse_document_frequency(term)+.
# After filling, every column with a non-zero norm is divided by that norm
# (presumably the Euclidean norm, GSL's dnrm2 -- confirm against rb-gsl).
class TermDocumentMatrix
  attr_reader :matrix, :labels, :number_of_terms, :number_of_documents, :non_zeros

  # Builds and column-normalizes the matrix for +corpus+.
  #
  # @param corpus [#terms, #documents, #document_count, #inverse_document_frequency]
  #   +terms+ must respond to +size+, +each_with_index+ and +to_a+, and each
  #   entry must respond to +first+ (the term itself). Each document must
  #   respond to +term_frequency(term)+.
  def initialize(corpus)
    @matrix = GSL::Matrix.alloc(corpus.terms.size, corpus.document_count)
    @non_zeros = 0
    @number_of_terms = corpus.terms.size
    @number_of_documents = corpus.documents.size
    fill_weights(corpus)
    normalize_columns
    @labels = corpus.terms.to_a.map(&:first)
  end

  # Product of the transposed matrix with the matrix itself.
  # NOTE(review): with unit-length columns this should be the document-by-document
  # cosine similarity; relies on GSL::Matrix#* being matrix multiplication.
  #
  # @return [GSL::Matrix]
  def similarity_matrix
    matrix.transpose * matrix
  end

  # @param idx [Integer] column (document) index
  # @return the column of the underlying matrix at +idx+
  def col(idx)
    @matrix.col(idx)
  end

  # @return [Array] the underlying matrix converted via its own +to_a+
  def to_a
    @matrix.to_a
  end

  private

  # Writes the TF-IDF weight of every (term, document) pair into the matrix
  # and counts the non-zero weights in @non_zeros.
  def fill_weights(corpus)
    corpus.terms.each_with_index do |entry, term_index|
      term = entry.first
      idf = corpus.inverse_document_frequency(term)
      corpus.documents.each_with_index do |document, document_index|
        weight = document.term_frequency(term) * idf
        @matrix[term_index, document_index] = weight
        @non_zeros += 1 unless weight.zero?
      end
    end
  end

  # Scales each column in place by the inverse of its norm.
  def normalize_columns
    @matrix.each_col do |column|
      norm = column.norm
      # An all-zero column (e.g. an empty document) has norm 0; dividing by it
      # would fill the column with NaN, so such columns are left as zeros.
      column.div!(norm) unless norm.zero?
    end
  end
end