-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTermDocumentGeneratorFunction.R
107 lines (82 loc) · 2.88 KB
/
TermDocumentGeneratorFunction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
library(Matrix)
#' Build document-by-term incidence matrices from text columns
#'
#' For every column listed in `text_col` the text is normalised (punctuation
#' and digits removed, whitespace runs collapsed, lower-cased), split on single
#' spaces, and turned into a sparse matrix with one row per document and one
#' column per retained term. A cell is 1 when the term occurs at least once in
#' the document and 0 otherwise.
#'
#' @param data_input A data frame (or matrix) holding the text columns.
#' @param text_col Column indices (or column names) of the text columns to
#'   process; each one yields its own matrix.
#' @param stop_list Optional character vector of terms to exclude. Terms are
#'   compared against the normalised tokens, so entries should be lower-case
#'   and free of punctuation. `NULL` disables stop-word filtering.
#' @param word_freq Keep only terms whose total occurrence count over the
#'   column is strictly greater than this value; `0` disables the filter.
#' @param freq_inc If true, also return the term frequency table for each
#'   column.
#' @return A named list (or `NULL` when `text_col` is empty). For each column
#'   `<name>` it contains `<name>_tdm`, a sparse document-by-term matrix whose
#'   rows are labelled `"<name> <i>"` and whose columns are ordered by
#'   decreasing term frequency. When `freq_inc` is true it is followed by
#'   `<name>_freq`, a two-column character matrix of terms and their counts in
#'   the same order.
termDocumentGenerator <- function(data_input, text_col, stop_list = NULL,
                                  word_freq = 10, freq_inc = FALSE) {
  result <- NULL
  for (n in text_col) {
    # Resolve the label once so row names and list names always agree,
    # whether the column was given by position or by name.
    col_name <- if (is.character(n)) n else colnames(data_input)[n]
    raw_text <- data_input[, n, drop = TRUE]

    # Normalise: strip punctuation and digits, collapse whitespace, lower-case.
    clean_text <- gsub("[[:punct:]]", "", raw_text)
    clean_text <- gsub("[[:digit:]]+", "", clean_text)
    clean_text <- trimws(gsub("[[:space:]]+", " ", clean_text))
    clean_text <- tolower(clean_text)
    n_docs <- length(clean_text)

    # Tokenise every document once and reuse the tokens for all later steps.
    doc_tokens <- strsplit(clean_text, " ", fixed = TRUE)
    all_tokens <- unlist(doc_tokens, use.names = FALSE)
    if (is.null(all_tokens)) {
      all_tokens <- character(0)
    }

    # Vocabulary in order of first appearance, without missing values,
    # one-character tokens, or stop words.
    vocab <- unique(all_tokens)
    vocab <- vocab[!is.na(vocab) & nchar(vocab) > 1]
    if (!is.null(stop_list)) {
      vocab <- vocab[!(vocab %in% stop_list)]
    }

    # Total number of occurrences of each vocabulary term in the column.
    token_idx <- match(all_tokens, vocab)
    counts <- tabulate(token_idx[!is.na(token_idx)], nbins = length(vocab))

    # Frequency filter (strictly greater than word_freq; 0 means keep all).
    if (word_freq != 0) {
      keep <- counts > word_freq
      vocab <- vocab[keep]
      counts <- counts[keep]
    }

    # Most frequent terms first; ties keep their first-appearance order.
    ord <- order(counts, decreasing = TRUE)
    terms <- vocab[ord]
    term_counts <- counts[ord]

    # Two-column character matrix (term, count); stays a matrix even with
    # zero or one surviving term.
    freq_table <- matrix(c(terms, as.character(term_counts)), ncol = 2)

    # For each document, the column positions of the distinct retained terms
    # it contains; tokens are de-duplicated so each cell is at most 1.
    term_hits <- lapply(doc_tokens, function(tok) {
      hit <- match(unique(tok), terms)
      hit[!is.na(hit)]
    })
    doc_idx <- rep(seq_along(term_hits), lengths(term_hits))
    term_idx <- as.integer(unlist(term_hits, use.names = FALSE))

    doc_names <- if (n_docs > 0) paste(col_name, seq_len(n_docs)) else NULL
    term_names <- if (length(terms) > 0) terms else NULL

    tdm <- Matrix::sparseMatrix(
      i = doc_idx,
      j = term_idx,
      x = rep(1, length(term_idx)),
      dims = c(n_docs, length(terms)),
      dimnames = list(doc_names, term_names)
    )

    # Append (never overwrite) so repeated columns in text_col each get
    # their own entries.
    entry <- list(tdm)
    names(entry) <- paste0(col_name, "_tdm")
    if (isTRUE(as.logical(freq_inc))) {
      freq_entry <- list(freq_table)
      names(freq_entry) <- paste0(col_name, "_freq")
      entry <- c(entry, freq_entry)
    }
    result <- c(result, entry)
  }
  result
}
#A<-termDocumentGenerator(All_Recipes_clean,text_col = c(9,10),stop_list = stoplist)