This example is inspired by the second paragraph of the paper
Matrices, vector spaces, and information retrieval. It shows a vector space representation of information used to represent documents in a collection and the query algorithm to find relevant documents. This example implement the model and the query matching algorithm using the linear algebra module provided by numpy. The program is tested on the sample data in Figure 2 of the paper.
import numpy
def buildTermDocumentMatrix(terms,docs):
""" build a term-document matrix """
tlen = len(terms)
dlen = len(docs)
A = numpy.zeros((tlen, dlen))
for i,t in enumerate(terms):
for j,d in enumerate(docs):
A[i,j] = d.lower().count(t) # computing terms frequencies
for i in range(dlen): # normalize columns
A[:tlen,i] = A[:tlen,i]/numpy.linalg.norm(A[:tlen,i])
return A
def query(A,q,docs):
""" make the query and print the result """
q = q/numpy.linalg.norm(q) # normalize query vector
for i in range(len(docs)):
# dot product
print '-Doc :',docs[i],'\n-Match:',numpy.dot(A[:6,i].T,q)
# documents collection
docs =['How to Bake Bread Without Recipes',
'The Classic Art of Viennese Pastry',
'Numerical Recipes: The Art of Scientific Computing',
'Breads, Pastries, Pies and Cakes : Quantity Baking Recipes',
'Pastry: A Book of Best French Recipe']
# interesting terms
terms = ['bak','recipe','bread','cake','pastr','pie']
# will return a matrix 6 terms x 5 documents
A = buildTermDocumentMatrix(terms,docs)
print 'Normalized Terms-Documents matrix'
print A
print '\n*** Query: "bak(e,ing)" + "bread"'
q1 = numpy.array([1,0,1,0,0,0])
query(A,q1,docs)
print '\n*** Query: "bak(e,ing)" only'
q2 = numpy.array([1,0,0,0,0,0])
query(A,q2,docs)
The results are the same as is the reference paper:
Normalized Terms-Documents matrix
[[ 0.57735027 0. 0. 0.40824829 0. ]
[ 0.57735027 0. 1. 0.40824829 0.70710678]
[ 0.57735027 0. 0. 0.40824829 0. ]
[ 0. 0. 0. 0.40824829 0. ]
[ 0. 1. 0. 0.40824829 0.70710678]
[ 0. 0. 0. 0.40824829 0. ]]
*** Query: "bak(e,ing)" + "bread"
-Doc : How to Bake Bread Without Recipes
-Match: 0.816496580928
-Doc : The Classic Art of Viennese Pastry
-Match: 0.0
-Doc : Numerical Recipes: The Art of Scientific Computing
-Match: 0.0
-Doc : Breads, Pastries, Pies and Cakes : Quantity Baking Recipes
-Match: 0.57735026919
-Doc : Pastry: A Book of Best French Recipe
-Match: 0.0
*** Query: "bak(e,ing)" only
-Doc : How to Bake Bread Without Recipes
-Match: 0.57735026919
-Doc : The Classic Art of Viennese Pastry
-Match: 0.0
-Doc : Numerical Recipes: The Art of Scientific Computing
-Match: 0.0
-Doc : Breads, Pastries, Pies and Cakes : Quantity Baking Recipes
-Match: 0.408248290464
-Doc : Pastry: A Book of Best French Recipe
-Match: 0.0
Other resources about about the model implemented can be found here: