forked from idiap/DocRec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreadWord1.m
executable file
·111 lines (102 loc) · 11.2 KB
/
readWord1.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
% Read Word.
% Copyright (c) 2015 Idiap Research Institute, http://www.idiap.ch/
% Written by Maryam Habibi <[email protected]> or <[email protected]>
% This file is part of the DocRec software.
% DocRec is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License version 3 as
% published by the Free Software Foundation.
% DocRec is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
% You should have received a copy of the GNU General Public License
% along with DocRec. If not, see <http://www.gnu.org/licenses/>.
function [X, wP, word] = readWord1(path,FName)
%READWORD1 Build a dictionary-sized word-count vector for one transcript.
%   [X, wP, word] = readWord1(path, FName) reads the text file
%   [path '/transcripts/' FName], lower-cases its tokens, removes stop
%   words, and counts how often every remaining token occurs. Counts are
%   stored at the position of the token in the dictionary word list.
%
%   Inputs:
%     path  - directory containing the 'transcripts' sub-folder. (The name
%             shadows MATLAB's builtin PATH inside this function; it is kept
%             for interface compatibility.)
%     FName - file name of the transcript inside that sub-folder.
%
%   Outputs:
%     X    - sparse row vector with size(word,1) columns; X(k) is the number
%            of occurrences of dictionary entry k in the transcript, and 0
%            for entries that do not occur.
%     wP   - the variable 'twp' loaded from '/path/to/twp.mat'.
%     word - the variable 'wordsw' loaded from '/path/to/datawiki1-100.mat'.
%            NOTE(review): assumed to be an Sw-by-1 cell array of strings;
%            confirm against the data file.
%
%   Errors:
%     readWord1:cannotOpenFile - the transcript file could not be opened.

%load topic models
topic_word_prob = load('/path/to/twp.mat');
wP = topic_word_prob.twp;
% load dictionary library with the number of their occurence in the topic space
directory = load('/path/to/datawiki1-100.mat');
%contains the list of words
word = directory.wordsw;
Sw = size(word,1);
%initialize the output with dictionary size (all zeros)
X = sparse(1,Sw);

%read input file
transcriptFile = [path '/transcripts/' FName];
[fid, openMsg] = fopen(transcriptFile,'r');
if fid < 0
    % Fail with a clear message instead of an "undefined variable" error
    % further down.
    error('readWord1:cannotOpenFile', ...
        'Cannot open transcript file "%s": %s', transcriptFile, openMsg);
end
% Close the file when this function exits, even if textscan throws.
closeFile = onCleanup(@() fclose(fid));
scanned = textscan(fid,'%s','whitespace','{,}[]() ');

%remove stopwords and cleaning
rawTokens = lower(scanned{1});
% Each textscan token may still contain white-space characters that are not
% in the custom 'whitespace' set above (e.g. tabs), so split every token.
pieces = regexp(rawTokens,'\s','Split');
if isempty(pieces)
    return; % empty transcript: X stays all-zero
end
% Concatenate the pieces of ALL tokens. A bare "x = pieces{:}" assignment
% would keep only the pieces of the first token and drop the rest.
tokens = [pieces{:}];
tokens = tokens(~ismember(tokens, stopWordList()));
% Consecutive separators produce empty tokens; they are not words.
tokens = tokens(~cellfun('isempty', tokens));
if isempty(tokens)
    return; % nothing left after stop-word removal
end

%represent words with topic information
% Count each distinct token once, then look it up in the dictionary.
[uniqueTokens, ~, groupIdx] = unique(tokens);
counts = accumarray(groupIdx(:), 1);
for k = 1:numel(uniqueTokens)
    % First matching dictionary position, if any.
    dictIdx = find(strcmp(word, uniqueTokens{k}), 1);
    if ~isempty(dictIdx)
        X(1, dictIdx) = counts(k);
    end
end
end

function stopWords = stopWordList()
% Return the stop-word list as a lower-cased cell array of strings.
% Transcript tokens are lower-cased before filtering, so the list is
% lower-cased too; otherwise its mixed-case entries could never match.
% Every physical line of the literal ends in '...' so the whole list forms
% a single row.
stopWords={'hmmm','talk','wait','happily','si','erm','talk','worry','worried','worring','days','waits','waiting','a', 'about', 'above', 'above', 'across', 'after', ...
'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', ...
'already', 'also','although','always','am','among', 'amongst', 'amoungst', ...
'amount', 'an', 'and', 'another', 'any','anyhow','anyone','anything','anyway', ...
'anywhere', 'are', 'around', 'as', 'at', 'back','be','became', 'because','become',...
'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below',...
'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom','but', 'by',...
'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de',...
'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight',...
'either', 'eleven','else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', ...
'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fify',...
'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found',...
'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'hasnt',...
'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', ...
'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'ie', 'if',...
'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', 'its', 'itself', 'keep', 'last',...
'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'me', 'meanwhile',...
'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',...
'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine',...
'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off',...
'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise',...
'our', 'ours', 'ourselves', 'out', 'over', 'own','part', 'per', 'perhaps', 'please',...
'put', 'rather', 're', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious',...
'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so',...
'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', ...
'still', 'such', 'system', 'take', 'ten', 'than', 'that', 'the', 'their', 'them',...
'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', ...
'therein', 'thereupon', 'these', 'they', 'thickv', 'thin', 'third', 'this', 'those',...
'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',...
'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up',...
'upon', 'us', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'whatever', 'when',...
'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein',...
'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',...
'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet',...
'you', 'your', 'yours', 'yourself', 'yourselves', 'the','laughter','day','true','easy','wrong','mhmm','xxx','xx','x','-----','xxxx','why','err','er','person','people','general','lot','er','yup','agree','therell','unless','sorry','how','dont','doesnt','wont','which','where','shouldnt','isnt','arent','werent','wasnt','wouldnt','didnt','havent','hasnt','stuff','none','theyre','theyve','Ive','what','california','david','rose','wanna','gotta','yep','''Kay','kay','''Cause','cause','Yep','Yeah','Mm-hmm','''kay','''couse','_sentence_start', '_sentence_stop', 'uh-huh', 'uh huh', 'uh', 'huh', 'can', 'just', 'would','its', 'it''s', 'we''re','were','im', 'I''m','well','we''ll','lets', 'Let''s','i', 'I', 'the', 'it', 'we', 'that', 'like', 'of', 'to', 'yeah', 'a', 'in', 'and','you', 'your', 'for', 'is', 'on', 'by', 'be', 'with', 'do', 'will', 'how', 'are', 'what', 'from', 'or', 'as', 'this', 'have', 'um', 'so', 'okay', 'mm-hmm', 'gonna', 'at', 'there''s','theres', 'they''re','theyre', 'but', '?', 'I''ll','i''ll','ill', 'that''s','thats', 'mm', 'yep', 'ah', 'lemme', 'you''re','youre', 'you''ll','youll', 'didn''t','didnt','dont', 'don''t', '''cause', 's', 've', 't','m', 'no', 'oh', 're', 'if', 'wow', 'cause', 'hmm', '[gap]', 'kay', 'my', 'am', 'yes', 'wasn', 'kinda', 'll', 'alright', 'they', 'an', 'uh', 'was', 'some', 'which', 'then', 'into', 'them', 'our', 'it''ll','itll', 'go', 'goes', 'me', 'again', 'we''ve','weve', 'now', 'up', 'few', 'quite', '''kay', 'em','''em', 'punctuation', 'about', 'anything', 'Hi','cant', 'Can''t', 'can''t', 'well', 'all', 'other', 'David', 'Andrew','isnt', 'isn''t', 'a', 'above', 'across', 'after', 'again', 'against', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'an', 'and', 'another', 'any', 'anybody', 'anyone','anywhere', 'are', 'area', 'areas', 'around', 'as', 'ask', 'asked', 'asking', 'asks', 'at', 'away', 'b', 'back', 'backed', 'backing', 'backs', 'be', 'became','because', 'become', 'becomes', 'been', 'before', ...
'began', 'behind', 'being', 'beings', 'best', 'better', 'between', 'big', 'both', 'but', 'by', 'c', 'came','can', 'cannot', 'case', 'cases', 'certain', 'certainly', 'clear', 'clearly', 'come', 'could', 'did', 'differ', 'different', 'differently','do', 'does', 'done', 'down', 'downed', 'downing', 'downs', 'during', 'each', 'early', 'either', 'end', 'ended', 'ending', 'ends', 'enough', 'even', 'evenly', 'ever', 'every', 'everybody', 'everyone','everything', 'everywhere', 'felt', 'few', 'find', 'finds', 'first', 'four', 'full', 'fully', 'further', 'furthered', 'furthering', 'furthers', 'gave', 'general', 'generally', 'get', 'gets', 'give', 'given','gives', 'go', 'going', 'good', 'goods', 'got', 'great', 'greater', 'greatest', 'group', 'has', 'had', 'have', 'having', 'her', 'he', 'here', 'herself','high', 'higher', 'highest', 'how', 'however', 'important', 'interest', 'interested', 'interesting', 'interests', 'into', 'its', 'itself', 'just', 'keep', 'keeps', 'knew', 'know', 'Known', 'knows', 'large', 'largely', 'last', 'later', 'latest', 'least', 'him', 'himself', 'his', 'i', 'however', 'if', 'my', 'less', 'let', 'lets', 'like', 'likely', 'long', 'longer', 'longest', 'making', 'man', 'many', 'may', 'me', 'member', 'members', 'men', 'might', 'more', 'most', 'mostly', 'mr', 'mrs', 'much', 'must', 'necessary', 'need', 'needed', 'needing', 'else', 'same', 'thing', 'too', 'much', 'anyway', 'more', 'who''s','whos', 'he''s','hes', 'doesn''t','doesnt','hell', 'he''ll', 'maybe', 'Maybe', 'I''ve', 'aren''t', 'won''t','wont', 'they''ve','itd', 'It''d','thatll', 'that''ll', 'they''ll','theyll', 'wasn''t', 'what''s', 'where''s', 'where''d','whered','thatd', 'that''d', 'hadn''t','hadnt', 'thing''ll','thingll', 'couldnt','couldn''t', 'who''s','havent', 'haven''t', 'doesn''t', 'wouldn''t','wouldnt','id', 'I''d','theyd', 'they''d','wed', 'we''d', 'W','shouldn''t', 'you''d','youd', 'one''s', 'youve','you''ve', 'haven''t', 'everybody''s', 'should', 'one', 'think', 'want', 'wanna', ...
'kind', 'bit', 'actually', 'made', 'make', 'makes', 'myself', 'never', 'needs', 'new', 'newer', 'newest', 'next', 'no', 'nobody', 'non', 'noone', 'not', 'nothing', 'now', 'nowhere', 'number', 'numbers', 'of', 'off', 'often', 'old', 'older', 'oldest', 'on', 'once', 'one', 'only', 'open', 'opened', 'opening', 'opens', 'or','order', 'ordered', 'ordering', 'orders', 'others', 'our', 'out', 'over', 'part', 'parted', 'parting', 'parts', 'per', 'perhaps', 'place', 'places', 'point', 'pointed', 'pointing', 'points', 'possible', 'present','presented', 'presenting', 'presents', 'problem', 'problems', 'put', 'puts', 'quite', 'rather', 'really', 'right', 'room', 'rooms', 'said', 'same', 'saw', 'say', 'says', 'second', 'seconds', 'see', 'seem', 'seemed', 'seeming', 'seems', 'sees', 'several', 'shall', 'show', 'showed', 'showing', 'shows', 'side', 'sides', 'since', 'small', 'smaller', 'smallest', 'so', 'some', 'somebody', 'someone', 'something', 'somewhere', 'state', 'states', 'still', 'such', 'sure', 'take', 'taken', 'than', 'that', 'their', 'then', 'them' , 'there', 'therefore', 'these', 'they', 'thing', 'things', 'think', 'thinks', 'this', 'those', 'though', 'thought', 'thoughts', 'through', 'thus', 'today', 'together', 'too', 'took', 'toward', 'turn', 'turned', 'turning', 'turns', 'under', 'until', 'up', 'upon', 'us', 'use', 'used', 'uses', 'very', 'want', 'wanted', 'wanting', 'wants', 'was', 'way', 'ways', 'we', 'well', 'wells', 'went', 'were', 'what', 'when', 'where', 'whether', 'which', 'while', 'who', 'whole', 'whose', 'why', 'will', 'with', 'within', 'without','work', 'worked', 'working', 'works', 'would', 'year', 'years', 'yet', 'you', 'young', 'dunno', 'th', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z','younger', 'youngest', 'your', 'yours', 'doing', 'look', 'minutes', 'little', 'guess', 'thinking'};
stopWords = lower(stopWords);
end