Skip to content
This repository has been archived by the owner on Feb 12, 2018. It is now read-only.

Commit

Permalink
Merge branch 'master' of https://github.com/ktisha/python2012
Browse files Browse the repository at this point in the history
  • Loading branch information
Liana committed Dec 7, 2012
2 parents aa8116b + 2903209 commit 7dd11eb
Show file tree
Hide file tree
Showing 38 changed files with 2,183 additions and 480 deletions.
1 change: 1 addition & 0 deletions alexeyev/hal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__author__ = 'Anton M Alexeyev'
61 changes: 37 additions & 24 deletions alexeyev/converter.py → alexeyev/hal/converter.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,23 @@
__author__ = 'Anton M Alexeyev'
# I decided to implement HAL, not SAM -- another method for cognitive studies and recommender systems

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from matrix_management import WordMatrix
import re
import nltk

# todo: filter after processing
# todo: punctuation + not-a-word tokens to be thrown away (regexp probably)
window_size = 6

input_text = open("testtext", "r")
text = ""

for line in input_text:
text += line.lower() + " "

print "Text loaded"
input_text.close()

# maybe should first do sent_tokenize, then word_tokenize
print "Text loaded"
print "Learning..."

# i chose the one everybody knows
stemmer = PorterStemmer()
Expand All @@ -35,13 +33,14 @@
else:
tokens_filtered += [ token ]

tokens_filtered += ['*'] * (window_size - 1)

# stemming
#normalized_tokens = [stemmer.stem(token) for token in tokens if token not in stopwords.words('english')]
normalized_tokens = [stemmer.stem(token) for token in tokens_filtered]

print "Tokens set filtered and stemmed :", normalized_tokens

window_size = 10
matrix = WordMatrix()

win_start = 0
Expand All @@ -51,32 +50,46 @@
second = 1
while first < len(window):
second = first + 1
set = []
while second < len(window):
matrix.add(window[first], window[second], window_size - second + first + 1)
if not (window[first], window[second]) in set:
matrix.add(window[first], window[second], 1)
set += [(window[first], window[second])]
second += 1
first += 1
win_start += 1

print "Co-occurence counted"
print "Keys quantity:", len(matrix.get_tokens())

"""
for key in matrix.get_tokens():
if key <> "*":
print key, matrix.kn_cooccurences(key, 6)

print "Now to more sophisticated analysis"

for key in matrix.get_tokens():
if key <> "*":
print key, matrix.kn_columns(key, 6, matrix.dist_cols_euclidean)
print key,
for succ in matrix.get_tokens():
print matrix.get(key, succ),
print
"""

print "Done"

"""
for token0 in matrix.get_tokens():
s += "\n" + token0
for token1 in matrix.get_tokens():
s += " " + str(matrix.get(token0, token1))
print s
"""
def get_token_by_word(word):
    """Normalize a raw user-typed word into a matrix token.

    Takes the first purely-alphabetic run in ``word``, lowercases it and
    applies the module-level Porter stemmer -- the same normalization the
    corpus tokens went through.

    Raises KeyError when ``word`` contains no alphabetic characters, so
    callers that already handle unknown tokens (KeyError) treat garbage
    input the same way.  (Previously this crashed with an uncaught
    IndexError on input such as "123" or an empty string.)
    """
    matches = re.findall(r"[A-Za-z]+", word)
    if not matches:
        # Consistent with the *_vector_by_token helpers, which raise
        # KeyError for tokens absent from the matrix.
        raise KeyError(word)
    return stemmer.stem(matches[0].lower())

def get_euclidean_vector_by_token(n, token):
print "Incoming token:", token
if token in matrix.token_set:
return matrix.kn_columns(token, n, matrix.dist_cols_euclidean)
raise KeyError

def get_cosine_vector_by_token(n, token):
print "Incoming token:", token
if token in matrix.token_set:
return matrix.kn_columns(token, n, matrix.dist_cols_inverted_cosine)
raise KeyError

def get_frequential_vector_by_token(n, token):
print "Incoming token:", token
if token in matrix.token_set:
return matrix.kn_cooccurences(token, n)
raise KeyError
74 changes: 74 additions & 0 deletions alexeyev/hal/gui.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-
from Tkinter import *
import tkFont
from converter import *

# Main application window (Python 2 / Tkinter).
root = Tk()

# Label fonts -- Courier is used consistently across the UI.
labelFont = tkFont.Font(family = "Courier", size = 18)
radioFont = tkFont.Font(family = "Courier", size = 11)

# Header label prompting the user for input (Russian: "Enter a word:").
static_string = StringVar()
headerLabel = Label(root, textvariable = static_string, relief = FLAT, font = labelFont)
static_string.set(u"Введите слово:")
headerLabel.pack()

# Text entry for the query word, pre-filled with a sample value.
textFont = tkFont.Font(family = "Courier", size = 16)
text = Text(root, height = 2, width = 20, font = textFont)
text.insert(END, "person")
text.pack()

# Holds the result text displayed in the bottom label; written by print_suggestions.
var = StringVar()

def print_suggestions():
    """Look up the typed word and show its nearest terms in the bottom label.

    Reads the query from the text widget, picks the similarity measure from
    the radio-button selection, and writes the resulting term list (or an
    error message for unknown words) into ``var``.
    """
    query = text.get(1.0, END)
    # Radio value -> lookup strategy (7 nearest terms in every mode).
    lookups = {
        1: lambda tok: get_euclidean_vector_by_token(7, tok),
        2: lambda tok: get_cosine_vector_by_token(7, tok),
        3: lambda tok: get_frequential_vector_by_token(7, tok),
    }
    terms = []
    message = ""
    try:
        choice = rb_var.get()
        pairs = lookups[choice](get_token_by_word(query)) if choice in lookups else []
        # Each result pair is (distance/weight, term); keep only the term.
        terms = [pair[1] for pair in pairs]
    except KeyError:
        message = u"Такого элемента нет!"
    for term in terms:
        message += term + "\n"
    var.set(message)
    bottomLabel.pack()

# Radio buttons selecting the similarity measure used by print_suggestions:
# 1 = Euclidean distance, 2 = cosine similarity, 3 = raw co-occurrence weight.
rb_var = IntVar()

rb_eucl = Radiobutton(root, text = "Euclidean distance", variable = rb_var, value = 1, font = radioFont)
rb_cosine = Radiobutton(root, text = "Cosine similarity", variable = rb_var, value = 2, font = radioFont)
rb_freq = Radiobutton(root, text = "Sorted by w[i]", variable = rb_var, value = 3, font = radioFont)
# Euclidean distance is the default selection.
rb_eucl.select()

rb_eucl.pack()
rb_cosine.pack()
rb_freq.pack()

# "Go!" button triggers the lookup callback.
B = Button(root, text ="Go!", command = print_suggestions,
font = tkFont.Font(family = "Courier", size = 14))
B.pack()

# Bottom header label (Russian: "Terms:") shown above the results list.
bhl_var = StringVar()
bhl_var.set(u"Термы:")
bottomHLabel = Label(root, textvariable = bhl_var, relief = FLAT, font = labelFont)
bottomHLabel.pack()

# Bottom label displaying the suggested terms; bound to ``var``.
bottomLabel = Label(root, textvariable = var, relief = FLAT, font = labelFont)
bottomLabel.pack()

# Enter the Tkinter event loop (blocks until the window is closed).
root.mainloop()
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
__author__ = 'Anton M Alexeyev'

import blist
from blist import sortedset

class WordMatrix:
Expand Down Expand Up @@ -35,23 +36,26 @@ def dist_cols_euclidean(self, col0, col1):
"""Measures distance between 2 columns: Euclidean distance"""
collector = 0
for key in self.token_set:
collector += (self.get(key, col0) - self.get(key, col1))**2
collector += (self.get(col0, key) - self.get(col1, key))**2
collector += (self.get(key, col0) - self.get(key, col1)) ** 2
collector += (self.get(col0, key) - self.get(col1, key)) ** 2
return collector**0.5

def dist_cols_cosine(self, col0, col1):
def dist_cols_inverted_cosine(self, col0, col1):
"""Measures distance between 2 columns: Cosine similarity"""
length0 = 0.0
length1 = 0.0
collector = 0.0

for key in self.token_set:
collector += (self.get(key, col0) * self.get(key, col1))
collector += (self.get(col0, key) * self.get(col1, key))
length0 += 2 * (self.get(key, col0)**2)
length1 += 2 * (self.get(key, col1)**2)
k0 = self.get(key, col0)
k1 = self.get(key, col1)
collector += k0 * k1
collector += self.get(col0, key) * self.get(col1, key)
length0 += 2 * (k0**2)
length1 += 2 * (k1**2)
length0 **= 0.5
length1 **= 0.5
return collector / (length0 * length1)
return (0.0 + length0 * length1) / (collector + 0.0)

def kn_columns(self, target_column, k, dist_func):
"""Gets k nearest columns to target_column by distance function provided by dist_func"""
Expand Down
Loading

0 comments on commit 7dd11eb

Please sign in to comment.