Skip to content

Commit e235c45

Browse files
committed
Add book examples
1 parent f1de57b commit e235c45

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+7147
-0
lines changed

chapter10/Thumbs.db

5 KB
Binary file not shown.

chapter10/articles.txt

Lines changed: 500 additions & 0 deletions
Large diffs are not rendered by default.

chapter10/clusters.py

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
import random
2+
import math
3+
from math import sqrt
4+
from PIL import Image,ImageDraw,ImageFont
5+
6+
# Returns the Pearson correlation coefficient for p1 and p2
7+
def pearson(v1,v2):
8+
# Simple sums
9+
sum1=sum(v1)
10+
sum2=sum(v2)
11+
12+
# Sums of the squares
13+
sum1Sq=sum([pow(v,2) for v in v1])
14+
sum2Sq=sum([pow(v,2) for v in v2])
15+
16+
# Sum of the products
17+
pSum=sum([v1[i]*v2[i] for i in range(len(v1))])
18+
19+
# Calculate r (Pearson score)
20+
num=pSum-(sum1*sum2/len(v1))
21+
den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
22+
if den==0: return 0
23+
24+
return 1.0-(num/den)
25+
26+
27+
class bicluster:
28+
def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
29+
self.left=left
30+
self.right=right
31+
self.vec=vec
32+
self.id=id
33+
self.distance=distance
34+
35+
def euclidean(v1,v2):
36+
sqsum=sum([math.pow(v1[i]-v2[i],2) for i in range(len(v1))])
37+
return math.sqrt(sqsum)
38+
39+
def printclust(clust,labels=None,n=0):
40+
for i in range(n): print ' ',
41+
if clust.id<0:
42+
print '-'
43+
else:
44+
if labels==None: print clust.id
45+
else: print labels[clust.id]
46+
if clust.left!=None: printclust(clust.left,labels=labels,n=n+1)
47+
if clust.right!=None: printclust(clust.right,labels=labels,n=n+1)
48+
49+
def hcluster(vecs,distance=pearson):
50+
distances={}
51+
currentclustid=-1
52+
clust=[bicluster(vecs[i],id=i) for i in range(len(vecs))]
53+
54+
while len(clust)>1:
55+
lowestpair=(0,1)
56+
closest=distance(clust[0].vec,clust[1].vec)
57+
for i in range(len(clust)):
58+
for j in range(i+1,len(clust)):
59+
if (clust[i].id,clust[j].id) not in distances:
60+
distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec)
61+
d=distances[(clust[i].id,clust[j].id)]
62+
63+
if d<closest:
64+
closest=d
65+
lowestpair=(i,j)
66+
67+
mergevec=[(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]
68+
error=closest
69+
newcluster=bicluster(mergevec,left=clust[lowestpair[0]],right=clust[lowestpair[1]],distance=error,id=currentclustid)
70+
71+
currentclustid-=1
72+
del clust[lowestpair[1]]
73+
del clust[lowestpair[0]]
74+
clust.append(newcluster)
75+
76+
return clust[0]
77+
78+
79+
def kcluster(vecs,distance=pearson,k=4):
80+
ranges=[(min([vec[i] for vec in vecs]),max([vec[i] for vec in vecs])) for i in range(len(vecs[0]))]
81+
clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(vecs[0]))] for j in range(k)]
82+
83+
lastmatches=None
84+
for t in range(100):
85+
print 'Iteration %d' % t
86+
bestmatches=[[] for i in range(k)]
87+
88+
for j in range(len(vecs)):
89+
vec=vecs[j]
90+
bestmatch=0
91+
for i in range(k):
92+
d=distance(clusters[i],vec)
93+
if d<distance(clusters[bestmatch],vec): bestmatch=i
94+
bestmatches[bestmatch].append(j)
95+
96+
if bestmatches==lastmatches: break
97+
lastmatches=bestmatches
98+
99+
for i in range(k):
100+
avgs=[0.0]*len(vecs[0])
101+
if len(bestmatches[i])>0:
102+
for vecid in bestmatches[i]:
103+
for m in range(len(vecs[vecid])):
104+
avgs[m]+=vecs[vecid][m]
105+
for j in range(len(avgs)):
106+
avgs[j]/=len(bestmatches[i])
107+
clusters[i]=avgs
108+
109+
return bestmatches
110+
111+
def readfile(filename):
112+
lines=[line for line in file(filename)]
113+
colnames=lines[0].strip().split('\t')[1:]
114+
rownames=[]
115+
data=[]
116+
for line in lines[1:]:
117+
p=line.strip().split('\t')
118+
rownames.append(p[0])
119+
data.append([float(x) for x in p[1:]])
120+
return rownames,colnames,data
121+
122+
def test2():
123+
rownames,colnames,data=readfile('datafile.txt')
124+
return hcluster(data)
125+
#for i in range(len(rownames)):
126+
# print i,rownames[i]
127+
128+
def distance(v1,v2):
129+
c1,c2,shr=0,0,0
130+
131+
for i in range(len(v1)):
132+
if v1[i]!=0: c1+=1
133+
if v2[i]!=0: c2+=1
134+
if v1[i]!=0 and v2[i]!=0: shr+=1
135+
136+
return float(shr)/(c1+c2-shr)
137+
138+
139+
#test2()
140+
141+
def getheight(clust):
142+
if clust.left==None and clust.right==None: return 1
143+
return getheight(clust.left)+getheight(clust.right)
144+
145+
def getdepth(clust):
146+
if clust.left==None and clust.right==None: return 0
147+
return max(getdepth(clust.left),getdepth(clust.right))+clust.distance
148+
149+
def drawdendrogram(clust,labels,jpeg='clusters.jpg'):
150+
h=getheight(clust)*20
151+
depth=getdepth(clust)
152+
w=1200
153+
scaling=float(w-150)/depth
154+
img=Image.new('RGB',(w,h),(255,255,255))
155+
draw=ImageDraw.Draw(img)
156+
157+
draw.line((0,h/2,10,h/2),fill=(255,0,0))
158+
159+
drawnode(draw,clust,10,(h/2),scaling,labels)
160+
img.save(jpeg,'JPEG')
161+
162+
def drawnode(draw,clust,x,y,scaling,labels):
163+
if clust.id<0:
164+
h1=getheight(clust.left)*20
165+
h2=getheight(clust.right)*20
166+
top=y-(h1+h2)/2
167+
bottom=y+(h1+h2)/2
168+
169+
ll=clust.distance*scaling
170+
171+
draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0))
172+
173+
draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0))
174+
draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0))
175+
176+
drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels)
177+
drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels)
178+
else:
179+
draw.text((x+5,y-7),labels[clust.id].encode('utf8'),(0,0,0))
180+
181+
def rotatematrix(data):
182+
newdata=[]
183+
for i in range(len(data[0])):
184+
newrow=[data[j][i] for j in range(len(data))]
185+
newdata.append(newrow)
186+
return newdata
187+
188+
def scaledown(data,distance=pearson,rate=0.01):
189+
n=len(data)
190+
realdist=[[distance(data[i],data[j]) for j in range(n)] for i in range(0,n)]
191+
192+
outersum=0.0
193+
194+
loc=[[random.random(),random.random()] for i in range(n)]
195+
fakedist=[[0.0 for j in range(n)] for i in range(n)]
196+
197+
lasterror=None
198+
for m in range(0,1000):
199+
# Find projected distances
200+
for i in range(n):
201+
for j in range(n):
202+
fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2)
203+
for x in range(len(loc[i]))]))
204+
205+
# Move points
206+
grad=[[0.0,0.0] for i in range(n)]
207+
208+
totalerror=0
209+
for k in range(n):
210+
for j in range(n):
211+
if j==k: continue
212+
errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k]
213+
grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm
214+
grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm
215+
totalerror+=abs(errorterm)
216+
print totalerror
217+
if lasterror and lasterror<totalerror: break
218+
lasterror=totalerror
219+
220+
for k in range(n):
221+
loc[k][0]-=rate*grad[k][0]
222+
loc[k][1]-=rate*grad[k][1]
223+
224+
return loc
225+
226+
def draw2d(data,labels,jpg='mds2d.jpg'):
227+
img=Image.new('RGB',(2000,2000),(255,255,255))
228+
draw=ImageDraw.Draw(img)
229+
for i in range(len(data)):
230+
x=(data[i][0]+0.5)*1000
231+
y=(data[i][1]+0.5)*1000
232+
draw.text((x,y),labels[i],(0,0,0))
233+
img.save(jpg,'JPEG')
234+
img.show()

0 commit comments

Comments
 (0)