|
| 1 | +import random |
| 2 | +import math |
| 3 | +from math import sqrt |
| 4 | +from PIL import Image,ImageDraw,ImageFont |
| 5 | + |
| 6 | +# Returns the Pearson correlation coefficient for p1 and p2 |
| 7 | +def pearson(v1,v2): |
| 8 | + # Simple sums |
| 9 | + sum1=sum(v1) |
| 10 | + sum2=sum(v2) |
| 11 | + |
| 12 | + # Sums of the squares |
| 13 | + sum1Sq=sum([pow(v,2) for v in v1]) |
| 14 | + sum2Sq=sum([pow(v,2) for v in v2]) |
| 15 | + |
| 16 | + # Sum of the products |
| 17 | + pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) |
| 18 | + |
| 19 | + # Calculate r (Pearson score) |
| 20 | + num=pSum-(sum1*sum2/len(v1)) |
| 21 | + den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) |
| 22 | + if den==0: return 0 |
| 23 | + |
| 24 | + return 1.0-(num/den) |
| 25 | + |
| 26 | + |
| 27 | +class bicluster: |
| 28 | + def __init__(self,vec,left=None,right=None,distance=0.0,id=None): |
| 29 | + self.left=left |
| 30 | + self.right=right |
| 31 | + self.vec=vec |
| 32 | + self.id=id |
| 33 | + self.distance=distance |
| 34 | + |
| 35 | +def euclidean(v1,v2): |
| 36 | + sqsum=sum([math.pow(v1[i]-v2[i],2) for i in range(len(v1))]) |
| 37 | + return math.sqrt(sqsum) |
| 38 | + |
| 39 | +def printclust(clust,labels=None,n=0): |
| 40 | + for i in range(n): print ' ', |
| 41 | + if clust.id<0: |
| 42 | + print '-' |
| 43 | + else: |
| 44 | + if labels==None: print clust.id |
| 45 | + else: print labels[clust.id] |
| 46 | + if clust.left!=None: printclust(clust.left,labels=labels,n=n+1) |
| 47 | + if clust.right!=None: printclust(clust.right,labels=labels,n=n+1) |
| 48 | + |
| 49 | +def hcluster(vecs,distance=pearson): |
| 50 | + distances={} |
| 51 | + currentclustid=-1 |
| 52 | + clust=[bicluster(vecs[i],id=i) for i in range(len(vecs))] |
| 53 | + |
| 54 | + while len(clust)>1: |
| 55 | + lowestpair=(0,1) |
| 56 | + closest=distance(clust[0].vec,clust[1].vec) |
| 57 | + for i in range(len(clust)): |
| 58 | + for j in range(i+1,len(clust)): |
| 59 | + if (clust[i].id,clust[j].id) not in distances: |
| 60 | + distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec) |
| 61 | + d=distances[(clust[i].id,clust[j].id)] |
| 62 | + |
| 63 | + if d<closest: |
| 64 | + closest=d |
| 65 | + lowestpair=(i,j) |
| 66 | + |
| 67 | + mergevec=[(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))] |
| 68 | + error=closest |
| 69 | + newcluster=bicluster(mergevec,left=clust[lowestpair[0]],right=clust[lowestpair[1]],distance=error,id=currentclustid) |
| 70 | + |
| 71 | + currentclustid-=1 |
| 72 | + del clust[lowestpair[1]] |
| 73 | + del clust[lowestpair[0]] |
| 74 | + clust.append(newcluster) |
| 75 | + |
| 76 | + return clust[0] |
| 77 | + |
| 78 | + |
| 79 | +def kcluster(vecs,distance=pearson,k=4): |
| 80 | + ranges=[(min([vec[i] for vec in vecs]),max([vec[i] for vec in vecs])) for i in range(len(vecs[0]))] |
| 81 | + clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(vecs[0]))] for j in range(k)] |
| 82 | + |
| 83 | + lastmatches=None |
| 84 | + for t in range(100): |
| 85 | + print 'Iteration %d' % t |
| 86 | + bestmatches=[[] for i in range(k)] |
| 87 | + |
| 88 | + for j in range(len(vecs)): |
| 89 | + vec=vecs[j] |
| 90 | + bestmatch=0 |
| 91 | + for i in range(k): |
| 92 | + d=distance(clusters[i],vec) |
| 93 | + if d<distance(clusters[bestmatch],vec): bestmatch=i |
| 94 | + bestmatches[bestmatch].append(j) |
| 95 | + |
| 96 | + if bestmatches==lastmatches: break |
| 97 | + lastmatches=bestmatches |
| 98 | + |
| 99 | + for i in range(k): |
| 100 | + avgs=[0.0]*len(vecs[0]) |
| 101 | + if len(bestmatches[i])>0: |
| 102 | + for vecid in bestmatches[i]: |
| 103 | + for m in range(len(vecs[vecid])): |
| 104 | + avgs[m]+=vecs[vecid][m] |
| 105 | + for j in range(len(avgs)): |
| 106 | + avgs[j]/=len(bestmatches[i]) |
| 107 | + clusters[i]=avgs |
| 108 | + |
| 109 | + return bestmatches |
| 110 | + |
| 111 | +def readfile(filename): |
| 112 | + lines=[line for line in file(filename)] |
| 113 | + colnames=lines[0].strip().split('\t')[1:] |
| 114 | + rownames=[] |
| 115 | + data=[] |
| 116 | + for line in lines[1:]: |
| 117 | + p=line.strip().split('\t') |
| 118 | + rownames.append(p[0]) |
| 119 | + data.append([float(x) for x in p[1:]]) |
| 120 | + return rownames,colnames,data |
| 121 | + |
| 122 | +def test2(): |
| 123 | + rownames,colnames,data=readfile('datafile.txt') |
| 124 | + return hcluster(data) |
| 125 | + #for i in range(len(rownames)): |
| 126 | + # print i,rownames[i] |
| 127 | + |
| 128 | +def distance(v1,v2): |
| 129 | + c1,c2,shr=0,0,0 |
| 130 | + |
| 131 | + for i in range(len(v1)): |
| 132 | + if v1[i]!=0: c1+=1 |
| 133 | + if v2[i]!=0: c2+=1 |
| 134 | + if v1[i]!=0 and v2[i]!=0: shr+=1 |
| 135 | + |
| 136 | + return float(shr)/(c1+c2-shr) |
| 137 | + |
| 138 | + |
| 139 | +#test2() |
| 140 | + |
| 141 | +def getheight(clust): |
| 142 | + if clust.left==None and clust.right==None: return 1 |
| 143 | + return getheight(clust.left)+getheight(clust.right) |
| 144 | + |
| 145 | +def getdepth(clust): |
| 146 | + if clust.left==None and clust.right==None: return 0 |
| 147 | + return max(getdepth(clust.left),getdepth(clust.right))+clust.distance |
| 148 | + |
| 149 | +def drawdendrogram(clust,labels,jpeg='clusters.jpg'): |
| 150 | + h=getheight(clust)*20 |
| 151 | + depth=getdepth(clust) |
| 152 | + w=1200 |
| 153 | + scaling=float(w-150)/depth |
| 154 | + img=Image.new('RGB',(w,h),(255,255,255)) |
| 155 | + draw=ImageDraw.Draw(img) |
| 156 | + |
| 157 | + draw.line((0,h/2,10,h/2),fill=(255,0,0)) |
| 158 | + |
| 159 | + drawnode(draw,clust,10,(h/2),scaling,labels) |
| 160 | + img.save(jpeg,'JPEG') |
| 161 | + |
| 162 | +def drawnode(draw,clust,x,y,scaling,labels): |
| 163 | + if clust.id<0: |
| 164 | + h1=getheight(clust.left)*20 |
| 165 | + h2=getheight(clust.right)*20 |
| 166 | + top=y-(h1+h2)/2 |
| 167 | + bottom=y+(h1+h2)/2 |
| 168 | + |
| 169 | + ll=clust.distance*scaling |
| 170 | + |
| 171 | + draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0)) |
| 172 | + |
| 173 | + draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0)) |
| 174 | + draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0)) |
| 175 | + |
| 176 | + drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels) |
| 177 | + drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels) |
| 178 | + else: |
| 179 | + draw.text((x+5,y-7),labels[clust.id].encode('utf8'),(0,0,0)) |
| 180 | + |
| 181 | +def rotatematrix(data): |
| 182 | + newdata=[] |
| 183 | + for i in range(len(data[0])): |
| 184 | + newrow=[data[j][i] for j in range(len(data))] |
| 185 | + newdata.append(newrow) |
| 186 | + return newdata |
| 187 | + |
| 188 | +def scaledown(data,distance=pearson,rate=0.01): |
| 189 | + n=len(data) |
| 190 | + realdist=[[distance(data[i],data[j]) for j in range(n)] for i in range(0,n)] |
| 191 | + |
| 192 | + outersum=0.0 |
| 193 | + |
| 194 | + loc=[[random.random(),random.random()] for i in range(n)] |
| 195 | + fakedist=[[0.0 for j in range(n)] for i in range(n)] |
| 196 | + |
| 197 | + lasterror=None |
| 198 | + for m in range(0,1000): |
| 199 | + # Find projected distances |
| 200 | + for i in range(n): |
| 201 | + for j in range(n): |
| 202 | + fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) |
| 203 | + for x in range(len(loc[i]))])) |
| 204 | + |
| 205 | + # Move points |
| 206 | + grad=[[0.0,0.0] for i in range(n)] |
| 207 | + |
| 208 | + totalerror=0 |
| 209 | + for k in range(n): |
| 210 | + for j in range(n): |
| 211 | + if j==k: continue |
| 212 | + errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k] |
| 213 | + grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm |
| 214 | + grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm |
| 215 | + totalerror+=abs(errorterm) |
| 216 | + print totalerror |
| 217 | + if lasterror and lasterror<totalerror: break |
| 218 | + lasterror=totalerror |
| 219 | + |
| 220 | + for k in range(n): |
| 221 | + loc[k][0]-=rate*grad[k][0] |
| 222 | + loc[k][1]-=rate*grad[k][1] |
| 223 | + |
| 224 | + return loc |
| 225 | + |
| 226 | +def draw2d(data,labels,jpg='mds2d.jpg'): |
| 227 | + img=Image.new('RGB',(2000,2000),(255,255,255)) |
| 228 | + draw=ImageDraw.Draw(img) |
| 229 | + for i in range(len(data)): |
| 230 | + x=(data[i][0]+0.5)*1000 |
| 231 | + y=(data[i][1]+0.5)*1000 |
| 232 | + draw.text((x,y),labels[i],(0,0,0)) |
| 233 | + img.save(jpg,'JPEG') |
| 234 | + img.show() |
0 commit comments