
Commit be27634

workflow
1 parent 7cb32b3 commit be27634


41 files changed: +15588 -0 lines changed

Convert_tcpd.py

+117
@@ -0,0 +1,117 @@
import pandas as pd
import os
import random
import datetime
import re

Sessions = [13,14,15,16]
Govts = ['nda1','upa1','upa2','nda2']

# error_bad_lines=False is the older pandas spelling; newer versions call it on_bad_lines='skip'
Questions = pd.read_csv('./TCPD_QH.csv', sep=',', index_col=False, encoding='utf-8', error_bad_lines=False)
#df = pd.read_csv('Test.csv', header=None, sep='\n')
#for Row in Questions:
#    Questions[Row] = Questions[Row].str.split('","', expand=True)

print(list(Questions))
print(len(list(Questions)))
print(len(Questions))

PathFinal = os.getcwd()
No = 1

with open('Metadata_LS.csv','a',encoding='utf-8') as fd:
    print('File Opened')
    for Ind in range(4):
        Session = Sessions[Ind]
        Govt = Govts[Ind]
        print(Govt,Session)
        QOld = pd.read_csv('./ls'+str(Session)+'_Q.csv', sep=',', index_col=False, encoding='utf-8')
        for Row in range(len(Questions)):

            if (Questions.loc[Questions.index[Row],'ls_number'] == Session):

                Text = Questions.loc[Questions.index[Row],'answer_text']
                Question = Questions.loc[Questions.index[Row],'question_text']
                if (Question!="URL_Not_Found"):
                    Date = Questions.loc[Questions.index[Row],'date']
                    QMinistry = Questions.loc[Questions.index[Row],'ministry']
                    print(Date,Row,Text)
                    try:
                        DateFull = datetime.datetime.strptime(Date,'%Y-%m-%dT00:00:00').strftime('%Y/%m/%d')
                    except ValueError:
                        DateFull = datetime.datetime.strptime(Date,'%Y- %m- %dT00:00:00').strftime('%Y/%m/%d')
                    # except KeyError:
                    #     BT+=1
                    #     DateFull = datetime.datetime.strptime(Questions.loc[Questions['ID']==CID,'date'][Row-BT],'%d.%m.%Y').strftime('%Y/%m/%d')
                    #     Text = Answers.loc[Answers['ID']==CID,'Answers'][Row-BT]
                    #     QLink = Questions.loc[Questions['ID']==CID,'Q_Link'][Row-BT]
                    #     Question = Questions.loc[Questions['ID']==CID,'Question'][Row-BT]
                    #     QMinistry = Questions.loc[Questions['ID']==CID,'ministry'][Row-BT]
                    #     Details = Questions.loc[Questions['ID']==CID,'subject'][Row-BT]

                    DateFull = DateFull.split('/')

                    # six-digit random token, zero-padded
                    Token = str(random.randint(0,999999))
                    if (len(Token)<6):
                        Token = '0'*(6-len(Token)) + Token

                    Year = DateFull[0]
                    Month = Year + DateFull[1]
                    Day = Month + DateFull[2]

                    Id = 't'+Day+Token
                    FileName = Id+".txt"

                    try:
                        # split the answer text on the first pair of parentheses
                        OB = Text.find('(')
                        CB = Text.find(')')
                        #Designation = Text[0:OB]
                        Loc = Text[OB+1:CB]
                        Answer = Text[CB+1:]
                        if (Loc=='a'):
                            Loc = 'na'
                            Answer = Text[OB+1:]

                        #Answer = str(Answer.encode("utf-8"))

                        QID = str(Questions.loc[Questions.index[Row],'id'])

                        QLink = QOld.loc[QOld['ID']==QID,'Q_Link']

                        Details = QOld.loc[QOld['ID']==QID,'subject']
                        QLoc = Questions.loc[Questions.index[Row],'member']
                        Party = Questions.loc[Questions.index[Row],'party']
                        Const = Questions.loc[Questions.index[Row],'constituency']
                        State = Questions.loc[Questions.index[Row],'state']
                        Sex = Questions.loc[Questions.index[Row],'gender']
                        ConstType = Questions.loc[Questions.index[Row],'constituency_type']

                        ConcatStr = Details+'/'+QLoc+'/'+Party+'/'+Const+'/'+State+'/'+Sex+'/'+ConstType

                        #Question = Question.replace(';',' ')

                        # collapse runs of dots and strip asterisks from the answer and the question
                        Answer = re.sub(r'\.+', ".", Answer)
                        Answer = re.sub(r'\.\s+', ".", Answer)
                        Answer = re.sub(r'\.+', ".", Answer)

                        Answer = re.sub(r'\*+', "", Answer)
                        Answer = re.sub(r'\*\s+', "", Answer)
                        Answer = re.sub(r'\*+', "", Answer)

                        Question = re.sub(r'\.+', ".", Question)
                        Question = re.sub(r'\.\s+', ".", Question)
                        Question = re.sub(r'\.+', ".", Question)

                        Question = re.sub(r'\*+', "", Question)
                        Question = re.sub(r'\*\s+', "", Question)
                        Question = re.sub(r'\*+', "", Question)

                        f = open(PathFinal+"\\TextFiles\\"+FileName, "a",encoding="utf-8")
                        f.write(Answer)
                        f.close()

                        fd.write('\n'+str(Id)+',loksabha,minister,'+str(Year)+','+str(Month)+','+str(Day)+',mp,'+str(Session)+','+Govt+',nationalpol,qanda,na,na,na,"'+str(QMinistry)+'",india,ncr,newdelhi,capital,hindiorenglish,"'+str(ConcatStr)+'","'+str(Question)+'","'+str(QLink)+'",loksabha'+str(Year)+','+str(No)+','+str(No+8637)+',notrans,rahul,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,loksabha,others,others,others,others,'+str(Loc)+',yes')
                        print(No)
                        No+=1
                    except AttributeError:
                        continue
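
Note: Convert_tcpd.py reads a fixed set of columns from TCPD_QH.csv through the .loc calls above. A minimal pre-flight sketch that checks those columns are present before the full run (the column list is taken from the script; nothing else is assumed):

import pandas as pd

# columns the script expects in TCPD_QH.csv, taken from the .loc calls in Convert_tcpd.py
REQUIRED = ['id', 'ls_number', 'date', 'ministry', 'member', 'party', 'constituency',
            'state', 'gender', 'constituency_type', 'question_text', 'answer_text']

header = pd.read_csv('./TCPD_QH.csv', nrows=0, encoding='utf-8')  # header row only
missing = [c for c in REQUIRED if c not in header.columns]
print('missing columns:', missing or 'none')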

MatchID_fast.py

+59
@@ -0,0 +1,59 @@
import os
import re
import pandas as pd

LS = pd.read_csv('Metadata_LS.csv', sep=',', index_col=False)
count = 0
Verdict = []
j = 0
a = 0
falseout = 0

Files = []
PathMain = "TextFiles"
print(PathMain)
for (dirpath, dirnames, filenames) in os.walk(PathMain):
    Files.extend(filenames)
    break

IDs = []
IDsDict = {}

NotFoundID = []
NotFoundFile = []
NewFiles = []
NewFilesDict = {}

# the try/except branches below record duplicate ids and file names;
# the commented-out blocks hold the original "not found" checks
for Row in range(len(LS)):
    CID = LS.loc[LS.index[Row],'id']
    IDs.append(CID)
    try:
        IDsDict[CID] += 1
        NotFoundID.append(CID)
    except KeyError:
        IDsDict[CID] = 1

print('Number of Files:',len(Files))
print('Number of IDs:',len(IDs))

for File in Files:
    FileName = File[:len(File)-4]   # strip the ".txt" extension
    NewFiles.append(FileName)
    try:
        NewFilesDict[FileName] += 1
        NotFoundFile.append(FileName)
    except KeyError:
        NewFilesDict[FileName] = 1
    #if FileName not in IDsDict.keys():
    #    NotFoundFile.append(FileName)

#for ID in IDs:
#    if ID not in NewFilesDict.keys():
#        NotFoundID.append(ID)

print('Files Checked:',len(NewFiles))
print('IDs Checked:',len(IDs))
print('IDs Not Found:',len(NotFoundID))
print('Files Not Found:',len(NotFoundFile))
print(NotFoundID)
print(NotFoundFile)
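
Note: as written, NotFoundID and NotFoundFile collect duplicate entries rather than missing ones; the commented-out blocks show the intended cross-check. A set-based sketch of that cross-check, assuming the same Metadata_LS.csv and TextFiles layout used above:

import os
import pandas as pd

ids = set(pd.read_csv('Metadata_LS.csv', index_col=False)['id'])
files = {f[:-4] for f in os.listdir('TextFiles') if f.endswith('.txt')}

print('IDs without a txt file:', sorted(ids - files))
print('txt files without an ID:', sorted(files - ids))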

MatchID_tuned.py

+46
@@ -0,0 +1,46 @@
import os
import re
import pandas as pd

folder_name=['loksabha']

Members = pd.read_csv('loksabha.csv', sep=',', index_col=False, encoding='iso-8859-1')
count=0
Verdict=[]
j=0
a=0
falseout = 0

for i in range(len(folder_name)):
    j+=1
    Files = []
    PathMain = str(folder_name[i])
    print(PathMain)
    for (dirpath, dirnames, filenames) in os.walk(PathMain):
        Files.extend(filenames)
        break

    IDs = []

    Verdict = []

    for Row in range(len(Members)):
        CID = Members.loc[Members.index[Row],'id']
        IDs.append(CID)

    for File in Files:
        FileName = File[:len(File)-4]   # strip the ".txt" extension
        a+=1
        if FileName in IDs:
            Verdict.append('True for '+FileName)
        else:
            Verdict.append('False for '+FileName)
            print('Error at '+FileName)
            falseout +=1
    count+=len(Verdict)
    #with open('false_output.txt','w',encoding='utf-8') as f:
    #    f.write('\n'.join(Verdict))
print('Files Checked - ',a)
print('Verdict Found - ',count)
print('Folders Checked - ',j)
print('Error in Files - ',falseout)
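
Note: the check "if FileName in IDs" rescans the whole list for every file; turning IDs into a set (which is what the dictionaries in MatchID_fast.py achieve) makes the membership test constant-time. A one-line sketch:

IDs = set(IDs)   # membership tests drop from O(n) per file to O(1)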

edit_final1_tuned.py

+27
@@ -0,0 +1,27 @@
import os
import re


#PathOfCode = os.getcwd()
folder_name="all" #keep the python script outside of the target folder (in this case the folder is called all)
Files = []
PathMain = folder_name
print(PathMain)
for (dirpath, dirnames, filenames) in os.walk(PathMain):
    Files.extend(filenames)
print(Files)

for File in Files:
    WFile = open(PathMain + "\\" + File,'r',encoding='utf-8') #this removes target characters; try 'utf-8' or 'latin-1'
    line = WFile.read()
    WFile.close()
    line_replaced = re.sub(r'\.+', ".", line) #collapse runs of dots into a single dot (whole file at once)
    line_replaced = re.sub(r'\.\s+', ".", line_replaced) #drop the whitespace that follows a dot
    line_replaced = re.sub(r'\.+', ".", line_replaced)
    line_replaced = re.sub(r'\*+', "", line_replaced)
    line_replaced = re.sub(r'\*\s+', "", line_replaced)
    line_replaced = re.sub(r'\*+', "", line_replaced)
    WFile = open(PathMain+"\\"+File,'w',encoding='utf-8')
    WFile.write(line_replaced)
    WFile.close()
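
Note: a small sketch of what the dot and asterisk substitutions above do to a sample string:

import re

sample = "Plan outlay.... was revised. ** See annexure *"
out = re.sub(r'\.+', ".", sample)      # "Plan outlay. was revised. ** See annexure *"
out = re.sub(r'\.\s+', ".", out)       # "Plan outlay.was revised.** See annexure *"
out = re.sub(r'\.+', ".", out)
out = re.sub(r'\*+', "", out)          # asterisks removed
out = re.sub(r'\*\s+', "", out)
out = re.sub(r'\*+', "", out)
print(out)                             # "Plan outlay.was revised. See annexure "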

final_processing_tuned.py

+27
@@ -0,0 +1,27 @@
import pandas as pd
import os

dataf = pd.read_csv("metadata.csv") #csv file
txt_files_to_be_removed=[] #the script needs to sit inside the folder that holds the txt files and the metadata

year_to_be_removed=['independencerepublicday'] #values to remove; numbers work too, e.g. [1916,1917], strings e.g. ['nehru','gandhi']
index_to_be_removed=[]
for i in range(len(dataf)):
    if(dataf.iloc[i,12] in year_to_be_removed): #in Python, column numbers start at 0, so [i,2] means column 3 of the CSV
        txt_files_to_be_removed.append(dataf.iloc[i,0])
        index_to_be_removed.append(i)
print(txt_files_to_be_removed)
print(index_to_be_removed)
dataf.drop(index=index_to_be_removed,inplace=True)
dataf.reset_index(drop=True,inplace=True)
#print(dataf)
dataf.to_csv("new_combined.csv",encoding="utf-8") #explicit utf-8 encoding is needed here

fin=[]
for i in range(len(txt_files_to_be_removed)):
    fin.append(txt_files_to_be_removed[i] + ".txt")

for i in fin:
    os.remove(i)
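
Note: the row loop above can also be written as a pandas boolean mask; a sketch of that equivalent, assuming the same metadata.csv layout (file id in column 0, filter value in column 12):

import pandas as pd

dataf = pd.read_csv("metadata.csv")
mask = dataf.iloc[:, 12].isin(['independencerepublicday'])
txt_files_to_be_removed = dataf.loc[mask, dataf.columns[0]].tolist()
dataf[~mask].reset_index(drop=True).to_csv("new_combined.csv", encoding="utf-8")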

replacesnumberingsfiveyearplans.py

+34
@@ -0,0 +1,34 @@
import os
import re


#PathOfCode = os.getcwd()
folder_name="list-wise-nonum" #keep the python script outside of the target folder (in this case the folder is called list-wise-nonum)
Files = []
PathMain = folder_name
print(PathMain)
for (dirpath, dirnames, filenames) in os.walk(PathMain):
    Files.extend(filenames)
print(Files)

for File in Files:
    print(File)
    WFile = open(PathMain + "\\" + File,'r',encoding='utf-8') #this removes target characters; try 'utf-8' or 'latin-1', or open the files without ",encoding='utf-8'" and rerun
    line = WFile.read()
    WFile.close()
    #line_replaced = re.sub(r'\.+', ".", line) #collapse runs of dots into a single dot
    #line_replaced = re.sub(r'\.\s+', ".", line_replaced) #drop the whitespace that follows a dot
    #line_replaced = re.sub(r'\.+', ".", line_replaced)
    #line_replaced = re.sub(r'\*+', "", line_replaced)
    #line_replaced = re.sub(r'\*\s+', "", line_replaced)
    #line_replaced = re.sub(r'\*+', "", line_replaced)
    line_replaced = re.sub(r'([0-9]+)(\.)(\n)', "", line)            #drop numbering like "3." together with the newline that follows
    line_replaced = re.sub(r'([0-9]+)(\.)(\s)', "", line_replaced)   #drop numbering like "3. " together with the whitespace that follows
    #line_replaced = re.sub(r'([A-Z]+)(\.)(\s)', "", line_replaced)          #fix is here: it re-adds the space between the dot and the next letter
    #line_replaced = re.sub(r'(\.)([0-9]+)(\.)', r"\1 \2\3", line_replaced)  #fix is here: it re-adds the space between the dot and the next letter
    #line_replaced = re.sub(r'(\.)([0-9]+)(\))', r"\1 \2\3", line_replaced)  #fix is here: it re-adds the space between the dot and the next letter
    #line_replaced = re.sub(r'(\.)([a-z])(\))', r"\1 \2\3", line_replaced)   #fix is here: it re-adds the space between the dot and the next letter
    WFile = open(PathMain+"\\"+File,'w',encoding='utf-8')
    WFile.write(line_replaced)
    WFile.close()
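
Note: a small sketch of what the two active numbering regexes do to a sample string:

import re

sample = "1. Agriculture\nSee para 2.\n3. Irrigation outlay"
out = re.sub(r'([0-9]+)(\.)(\n)', "", sample)   # removes "2." and its newline, joining the lines
out = re.sub(r'([0-9]+)(\.)(\s)', "", out)      # removes "1. " and "3. " with the following whitespace
print(out)                                      # "Agriculture\nSee para Irrigation outlay"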

shivaniutf8.py

+11
@@ -0,0 +1,11 @@
# runs inside Notepad++'s PythonScript plugin: 'notepad' and 'console' are objects the plugin provides, not standard-library modules
import os
import sys

filePathSrc="C:\\Users\\jtmartelli\\Google Drive\\Textual_analysis\\R\\ambedkargandhiplus\\merge\\beta1.1\\corpus\\shivaniutf8"
for root, dirs, files in os.walk(filePathSrc):
    for fn in files:
        if fn[-4:] == '.txt': #or fn[-4:] == '.csv':
            notepad.open(root + "\\" + fn)
            console.write(root + "\\" + fn + "\r\n")
            notepad.runMenuCommand("Encoding", "Convert to UTF-8")
            notepad.save()
            notepad.close()
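
Note: the script above depends on Notepad++'s PythonScript plugin for the notepad and console objects. A plain-Python alternative sketch for the same batch conversion, assuming the non-UTF-8 sources are latin-1 (the source encoding and folder name here are assumptions to adjust):

import os

filePathSrc = "shivaniutf8"   # hypothetical local folder; swap in the real corpus path
for root, dirs, files in os.walk(filePathSrc):
    for fn in files:
        if fn.endswith('.txt'):
            path = os.path.join(root, fn)
            with open(path, 'r', encoding='latin-1') as src:   # assumed source encoding
                text = src.read()
            with open(path, 'w', encoding='utf-8') as dst:
                dst.write(text)
            print(path)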

star_tuned.py

+57
@@ -0,0 +1,57 @@
#-*- coding: utf-8 -*-
import csv
from bs4 import BeautifulSoup
#import urllib.request
from subprocess import call
#import pyperclip
from textblob import TextBlob
import numpy as np
import collections

filename='new_combined.csv' #place the script outside the txt folder ('all'); the metadata csv sits next to the script

string=' '
count=0

latin_arr=[]
with open(filename, 'rb') as f:
    reader = csv.reader(f)   #this reader is created on a binary handle and never iterated
    #your_list = list(reader)
    #print(your_list)

counter=1
simp=0
with open(filename,'r',encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        if(count==0): #skip the first row, i.e. the one that contains the variable names
            count=1
            continue
        temp='**** '
        #print (len(row))
        columns=len(row)
        for i in range (0,columns):
            if(row[i]==' ' or row[i]==''):
                row[i]='x'
            temp=temp+'*var{}_'.format(i+1)+str(row[i])+' '
        #print (row[0])
        #fi = open('corpus/{}.txt'.format(your_list[counter][0]), 'rb')
        fi = open('all/{}.txt'.format(row[0]), 'rb') #folder name
        counter+=1
        raw = fi.read()   #read the bytes once so the latin-1 fallback decodes the same content
        fi.close()
        try:
            string=string+temp+'\n'+raw.decode("utf-8") +'\n'
            #print(string)
        except UnicodeDecodeError:
            print ("latin")
            simp+=1
            latin_arr.append(row[0])
            string=string+temp+'\n'+raw.decode("latin-1") +'\n'


f=open('concat2.txt','w', encoding="utf-8")
f.write(string)
print (simp)
f2=open('latin_files_names.txt','w', encoding="utf-8")
f2.write('\n'.join(latin_arr))   #write() needs a string, so join the list of file ids
print(latin_arr)
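
Note: each record written to concat2.txt starts with a header of the form '**** *var1_value *var2_value ...', with blank cells replaced by 'x', followed by the matching txt file's text. A small sketch of the header construction for one hypothetical row:

row = ['t2014052912345', 'loksabha', '', 'agriculture']   # hypothetical metadata row
temp = '**** '
for i in range(len(row)):
    if row[i] == ' ' or row[i] == '':
        row[i] = 'x'
    temp = temp + '*var{}_'.format(i+1) + str(row[i]) + ' '
print(temp)   # "**** *var1_t2014052912345 *var2_loksabha *var3_x *var4_agriculture "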
