
Commit be27634

workflow
1 parent 7cb32b3 commit be27634


41 files changed: +15588 -0 lines changed

Convert_tcpd.py

+117
@@ -0,0 +1,117 @@
import pandas as pd
import os
import random
import datetime
import re

Sessions = [13,14,15,16]
Govts = ['nda1','upa1','upa2','nda2']

# error_bad_lines=False is the older pandas spelling; newer versions call it on_bad_lines='skip'
Questions = pd.read_csv('./TCPD_QH.csv', sep=',', index_col=False, encoding='utf-8', error_bad_lines=False)
#df = pd.read_csv('Test.csv', header=None, sep='\n')
#for Row in Questions:
#    Questions[Row] = Questions[Row].str.split('","', expand=True)

print(list(Questions))
print(len(list(Questions)))
print(len(Questions))

PathFinal = os.getcwd()
No = 1

with open('Metadata_LS.csv','a',encoding='utf-8') as fd:
    print('File Opened')
    for Ind in range(4):
        Session = Sessions[Ind]
        Govt = Govts[Ind]
        print(Govt,Session)
        QOld = pd.read_csv('./ls'+str(Session)+'_Q.csv', sep=',', index_col=False, encoding='utf-8')
        for Row in range(len(Questions)):

            if (Questions.loc[Questions.index[Row],'ls_number'] == Session):

                Text = Questions.loc[Questions.index[Row],'answer_text']
                Question = Questions.loc[Questions.index[Row],'question_text']
                if (Question!="URL_Not_Found"):
                    Date = Questions.loc[Questions.index[Row],'date']
                    QMinistry = Questions.loc[Questions.index[Row],'ministry']
                    print(Date,Row,Text)
                    try:
                        DateFull = datetime.datetime.strptime(Date,'%Y-%m-%dT00:00:00').strftime('%Y/%m/%d')
                    except ValueError:
                        DateFull = datetime.datetime.strptime(Date,'%Y- %m- %dT00:00:00').strftime('%Y/%m/%d')
                    # except KeyError:
                    #     BT+=1
                    #     DateFull = datetime.datetime.strptime(Questions.loc[Questions['ID']==CID,'date'][Row-BT],'%d.%m.%Y').strftime('%Y/%m/%d')
                    #     Text = Answers.loc[Answers['ID']==CID,'Answers'][Row-BT]
                    #     QLink = Questions.loc[Questions['ID']==CID,'Q_Link'][Row-BT]
                    #     Question = Questions.loc[Questions['ID']==CID,'Question'][Row-BT]
                    #     QMinistry = Questions.loc[Questions['ID']==CID,'ministry'][Row-BT]
                    #     Details = Questions.loc[Questions['ID']==CID,'subject'][Row-BT]

                    DateFull = DateFull.split('/')

                    # six-digit random token, zero-padded
                    Token = str(random.randint(0,999999))
                    if (len(Token)<6):
                        Token = '0'*(6-len(Token)) + Token

                    Year = DateFull[0]
                    Month = Year + DateFull[1]
                    Day = Month + DateFull[2]

                    Id = 't'+Day+Token
                    FileName = Id+".txt"

                    try:
                        # split the answer text on the first pair of parentheses
                        OB = Text.find('(')
                        CB = Text.find(')')
                        #Designation = Text[0:OB]
                        Loc = Text[OB+1:CB]
                        Answer = Text[CB+1:]
                        if (Loc=='a'):
                            Loc = 'na'
                            Answer = Text[OB+1:]

                        #Answer = str(Answer.encode("utf-8"))

                        QID = str(Questions.loc[Questions.index[Row],'id'])

                        QLink = QOld.loc[QOld['ID']==QID,'Q_Link']

                        Details = QOld.loc[QOld['ID']==QID,'subject']
                        QLoc = Questions.loc[Questions.index[Row],'member']
                        Party = Questions.loc[Questions.index[Row],'party']
                        Const = Questions.loc[Questions.index[Row],'constituency']
                        State = Questions.loc[Questions.index[Row],'state']
                        Sex = Questions.loc[Questions.index[Row],'gender']
                        ConstType = Questions.loc[Questions.index[Row],'constituency_type']

                        ConcatStr = Details+'/'+QLoc+'/'+Party+'/'+Const+'/'+State+'/'+Sex+'/'+ConstType

                        #Question = Question.replace(';',' ')

                        # collapse runs of dots and strip asterisks from the answer and the question
                        Answer = re.sub(r'\.+', ".", Answer)
                        Answer = re.sub(r'\.\s+', ".", Answer)
                        Answer = re.sub(r'\.+', ".", Answer)

                        Answer = re.sub(r'\*+', "", Answer)
                        Answer = re.sub(r'\*\s+', "", Answer)
                        Answer = re.sub(r'\*+', "", Answer)

                        Question = re.sub(r'\.+', ".", Question)
                        Question = re.sub(r'\.\s+', ".", Question)
                        Question = re.sub(r'\.+', ".", Question)

                        Question = re.sub(r'\*+', "", Question)
                        Question = re.sub(r'\*\s+', "", Question)
                        Question = re.sub(r'\*+', "", Question)

                        f = open(PathFinal+"\\TextFiles\\"+FileName, "a",encoding="utf-8")
                        f.write(Answer)
                        f.close()

                        fd.write('\n'+str(Id)+',loksabha,minister,'+str(Year)+','+str(Month)+','+str(Day)+',mp,'+str(Session)+','+Govt+',nationalpol,qanda,na,na,na,"'+str(QMinistry)+'",india,ncr,newdelhi,capital,hindiorenglish,"'+str(ConcatStr)+'","'+str(Question)+'","'+str(QLink)+'",loksabha'+str(Year)+','+str(No)+','+str(No+8637)+',notrans,rahul,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,others,loksabha,others,others,others,others,'+str(Loc)+',yes')
                        print(No)
                        No+=1
                    except AttributeError:
                        continue
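
Note: Convert_tcpd.py reads a fixed set of columns from TCPD_QH.csv through the .loc calls above. A minimal pre-flight sketch that checks those columns are present before the full run (the column list is taken from the script; nothing else is assumed):

import pandas as pd

# columns the script expects in TCPD_QH.csv, taken from the .loc calls in Convert_tcpd.py
REQUIRED = ['id', 'ls_number', 'date', 'ministry', 'member', 'party', 'constituency',
            'state', 'gender', 'constituency_type', 'question_text', 'answer_text']

header = pd.read_csv('./TCPD_QH.csv', nrows=0, encoding='utf-8')  # header row only
missing = [c for c in REQUIRED if c not in header.columns]
print('missing columns:', missing or 'none')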

MatchID_fast.py

+59
@@ -0,0 +1,59 @@
import os
import re
import pandas as pd

LS = pd.read_csv('Metadata_LS.csv', sep=',', index_col=False)
count = 0
Verdict = []
j = 0
a = 0
falseout = 0

Files = []
PathMain = "TextFiles"
print(PathMain)
for (dirpath, dirnames, filenames) in os.walk(PathMain):
    Files.extend(filenames)
    break

IDs = []
IDsDict = {}

NotFoundID = []
NotFoundFile = []
NewFiles = []
NewFilesDict = {}

# the try/except branches below record duplicate ids and file names;
# the commented-out blocks hold the original "not found" checks
for Row in range(len(LS)):
    CID = LS.loc[LS.index[Row],'id']
    IDs.append(CID)
    try:
        IDsDict[CID] += 1
        NotFoundID.append(CID)
    except KeyError:
        IDsDict[CID] = 1

print('Number of Files:',len(Files))
print('Number of IDs:',len(IDs))

for File in Files:
    FileName = File[:len(File)-4]   # strip the ".txt" extension
    NewFiles.append(FileName)
    try:
        NewFilesDict[FileName] += 1
        NotFoundFile.append(FileName)
    except KeyError:
        NewFilesDict[FileName] = 1
    #if FileName not in IDsDict.keys():
    #    NotFoundFile.append(FileName)

#for ID in IDs:
#    if ID not in NewFilesDict.keys():
#        NotFoundID.append(ID)

print('Files Checked:',len(NewFiles))
print('IDs Checked:',len(IDs))
print('IDs Not Found:',len(NotFoundID))
print('Files Not Found:',len(NotFoundFile))
print(NotFoundID)
print(NotFoundFile)
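
Note: as written, NotFoundID and NotFoundFile collect duplicate entries rather than missing ones; the commented-out blocks show the intended cross-check. A set-based sketch of that cross-check, assuming the same Metadata_LS.csv and TextFiles layout used above:

import os
import pandas as pd

ids = set(pd.read_csv('Metadata_LS.csv', index_col=False)['id'])
files = {f[:-4] for f in os.listdir('TextFiles') if f.endswith('.txt')}

print('IDs without a txt file:', sorted(ids - files))
print('txt files without an ID:', sorted(files - ids))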

MatchID_tuned.py

+46
@@ -0,0 +1,46 @@
import os
import re
import pandas as pd

folder_name=['loksabha']

Members = pd.read_csv('loksabha.csv', sep=',', index_col=False, encoding='iso-8859-1')
count=0
Verdict=[]
j=0
a=0
falseout = 0

for i in range(len(folder_name)):
    j+=1
    Files = []
    PathMain = str(folder_name[i])
    print(PathMain)
    for (dirpath, dirnames, filenames) in os.walk(PathMain):
        Files.extend(filenames)
        break

    IDs = []

    Verdict = []

    for Row in range(len(Members)):
        CID = Members.loc[Members.index[Row],'id']
        IDs.append(CID)

    for File in Files:
        FileName = File[:len(File)-4]   # strip the ".txt" extension
        a+=1
        if FileName in IDs:
            Verdict.append('True for '+FileName)
        else:
            Verdict.append('False for '+FileName)
            print('Error at '+FileName)
            falseout +=1
    count+=len(Verdict)
    #with open('false_output.txt','w',encoding='utf-8') as f:
    #    f.write('\n'.join(Verdict))
print('Files Checked - ',a)
print('Verdict Found - ',count)
print('Folders Checked - ',j)
print('Error in Files - ',falseout)
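
Note: the check "if FileName in IDs" rescans the whole list for every file; turning IDs into a set (which is what the dictionaries in MatchID_fast.py achieve) makes the membership test constant-time. A one-line sketch:

IDs = set(IDs)   # membership tests drop from O(n) per file to O(1)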

edit_final1_tuned.py

+27
@@ -0,0 +1,27 @@
import os
import re


#PathOfCode = os.getcwd()
folder_name="all" #keep the python script outside of the target folder (in this case the folder is called all)
Files = []
PathMain = folder_name
print(PathMain)
for (dirpath, dirnames, filenames) in os.walk(PathMain):
    Files.extend(filenames)
print(Files)

for File in Files:
    WFile = open(PathMain + "\\" + File,'r',encoding='utf-8') #this removes target characters; try 'utf-8' or 'latin-1'
    line = WFile.read()
    WFile.close()
    line_replaced = re.sub(r'\.+', ".", line) #collapse runs of dots into a single dot (whole file at once)
    line_replaced = re.sub(r'\.\s+', ".", line_replaced) #drop the whitespace that follows a dot
    line_replaced = re.sub(r'\.+', ".", line_replaced)
    line_replaced = re.sub(r'\*+', "", line_replaced)
    line_replaced = re.sub(r'\*\s+', "", line_replaced)
    line_replaced = re.sub(r'\*+', "", line_replaced)
    WFile = open(PathMain+"\\"+File,'w',encoding='utf-8')
    WFile.write(line_replaced)
    WFile.close()
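
Note: a small sketch of what the dot and asterisk substitutions above do to a sample string:

import re

sample = "Plan outlay.... was revised. ** See annexure *"
out = re.sub(r'\.+', ".", sample)      # "Plan outlay. was revised. ** See annexure *"
out = re.sub(r'\.\s+', ".", out)       # "Plan outlay.was revised.** See annexure *"
out = re.sub(r'\.+', ".", out)
out = re.sub(r'\*+', "", out)          # asterisks removed
out = re.sub(r'\*\s+', "", out)
out = re.sub(r'\*+', "", out)
print(out)                             # "Plan outlay.was revised. See annexure "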

final_processing_tuned.py

+27
@@ -0,0 +1,27 @@
import pandas as pd
import os

dataf = pd.read_csv("metadata.csv") #csv file
txt_files_to_be_removed=[] #the script needs to sit inside the folder that holds the txt files and the metadata

year_to_be_removed=['independencerepublicday'] #values to remove; numbers work too, e.g. [1916,1917], strings e.g. ['nehru','gandhi']
index_to_be_removed=[]
for i in range(len(dataf)):
    if(dataf.iloc[i,12] in year_to_be_removed): #in Python, column numbers start at 0, so [i,2] means column 3 of the CSV
        txt_files_to_be_removed.append(dataf.iloc[i,0])
        index_to_be_removed.append(i)
print(txt_files_to_be_removed)
print(index_to_be_removed)
dataf.drop(index=index_to_be_removed,inplace=True)
dataf.reset_index(drop=True,inplace=True)
#print(dataf)
dataf.to_csv("new_combined.csv",encoding="utf-8") #explicit utf-8 encoding is needed here

fin=[]
for i in range(len(txt_files_to_be_removed)):
    fin.append(txt_files_to_be_removed[i] + ".txt")

for i in fin:
    os.remove(i)
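
Note: the row loop above can also be written as a pandas boolean mask; a sketch of that equivalent, assuming the same metadata.csv layout (file id in column 0, filter value in column 12):

import pandas as pd

dataf = pd.read_csv("metadata.csv")
mask = dataf.iloc[:, 12].isin(['independencerepublicday'])
txt_files_to_be_removed = dataf.loc[mask, dataf.columns[0]].tolist()
dataf[~mask].reset_index(drop=True).to_csv("new_combined.csv", encoding="utf-8")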

replacesnumberingsfiveyearplans.py

+34
@@ -0,0 +1,34 @@
import os
import re


#PathOfCode = os.getcwd()
folder_name="list-wise-nonum" #keep the python script outside of the target folder (in this case the folder is called list-wise-nonum)
Files = []
PathMain = folder_name
print(PathMain)
for (dirpath, dirnames, filenames) in os.walk(PathMain):
    Files.extend(filenames)
print(Files)

for File in Files:
    print(File)
    WFile = open(PathMain + "\\" + File,'r',encoding='utf-8') #this removes target characters; try 'utf-8' or 'latin-1', or open the files without ",encoding='utf-8'" and rerun
    line = WFile.read()
    WFile.close()
    #line_replaced = re.sub(r'\.+', ".", line) #collapse runs of dots into a single dot
    #line_replaced = re.sub(r'\.\s+', ".", line_replaced) #drop the whitespace that follows a dot
    #line_replaced = re.sub(r'\.+', ".", line_replaced)
    #line_replaced = re.sub(r'\*+', "", line_replaced)
    #line_replaced = re.sub(r'\*\s+', "", line_replaced)
    #line_replaced = re.sub(r'\*+', "", line_replaced)
    line_replaced = re.sub(r'([0-9]+)(\.)(\n)', "", line)            #drop numbering like "3." together with the newline that follows
    line_replaced = re.sub(r'([0-9]+)(\.)(\s)', "", line_replaced)   #drop numbering like "3. " together with the whitespace that follows
    #line_replaced = re.sub(r'([A-Z]+)(\.)(\s)', "", line_replaced)          #fix is here: it re-adds the space between the dot and the next letter
    #line_replaced = re.sub(r'(\.)([0-9]+)(\.)', r"\1 \2\3", line_replaced)  #fix is here: it re-adds the space between the dot and the next letter
    #line_replaced = re.sub(r'(\.)([0-9]+)(\))', r"\1 \2\3", line_replaced)  #fix is here: it re-adds the space between the dot and the next letter
    #line_replaced = re.sub(r'(\.)([a-z])(\))', r"\1 \2\3", line_replaced)   #fix is here: it re-adds the space between the dot and the next letter
    WFile = open(PathMain+"\\"+File,'w',encoding='utf-8')
    WFile.write(line_replaced)
    WFile.close()
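
Note: a small sketch of what the two active numbering regexes do to a sample string:

import re

sample = "1. Agriculture\nSee para 2.\n3. Irrigation outlay"
out = re.sub(r'([0-9]+)(\.)(\n)', "", sample)   # removes "2." and its newline, joining the lines
out = re.sub(r'([0-9]+)(\.)(\s)', "", out)      # removes "1. " and "3. " with the following whitespace
print(out)                                      # "Agriculture\nSee para Irrigation outlay"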

shivaniutf8.py

+11
@@ -0,0 +1,11 @@
# runs inside Notepad++'s PythonScript plugin: 'notepad' and 'console' are objects the plugin provides, not standard-library modules
import os
import sys

filePathSrc="C:\\Users\\jtmartelli\\Google Drive\\Textual_analysis\\R\\ambedkargandhiplus\\merge\\beta1.1\\corpus\\shivaniutf8"
for root, dirs, files in os.walk(filePathSrc):
    for fn in files:
        if fn[-4:] == '.txt': #or fn[-4:] == '.csv':
            notepad.open(root + "\\" + fn)
            console.write(root + "\\" + fn + "\r\n")
            notepad.runMenuCommand("Encoding", "Convert to UTF-8")
            notepad.save()
            notepad.close()
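
Note: the script above depends on Notepad++'s PythonScript plugin for the notepad and console objects. A plain-Python alternative sketch for the same batch conversion, assuming the non-UTF-8 sources are latin-1 (the source encoding and folder name here are assumptions to adjust):

import os

filePathSrc = "shivaniutf8"   # hypothetical local folder; swap in the real corpus path
for root, dirs, files in os.walk(filePathSrc):
    for fn in files:
        if fn.endswith('.txt'):
            path = os.path.join(root, fn)
            with open(path, 'r', encoding='latin-1') as src:   # assumed source encoding
                text = src.read()
            with open(path, 'w', encoding='utf-8') as dst:
                dst.write(text)
            print(path)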

star_tuned.py

+57
@@ -0,0 +1,57 @@
#-*- coding: utf-8 -*-
import csv
from bs4 import BeautifulSoup
#import urllib.request
from subprocess import call
#import pyperclip
from textblob import TextBlob
import numpy as np
import collections

filename='new_combined.csv' #place the script outside the txt folder ('all'); the metadata csv sits next to the script

string=' '
count=0

latin_arr=[]
with open(filename, 'rb') as f:
    reader = csv.reader(f)   #this reader is created on a binary handle and never iterated
    #your_list = list(reader)
    #print(your_list)

counter=1
simp=0
with open(filename,'r',encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        if(count==0): #skip the first row, i.e. the one that contains the variable names
            count=1
            continue
        temp='**** '
        #print (len(row))
        columns=len(row)
        for i in range (0,columns):
            if(row[i]==' ' or row[i]==''):
                row[i]='x'
            temp=temp+'*var{}_'.format(i+1)+str(row[i])+' '
        #print (row[0])
        #fi = open('corpus/{}.txt'.format(your_list[counter][0]), 'rb')
        fi = open('all/{}.txt'.format(row[0]), 'rb') #folder name
        counter+=1
        raw = fi.read()   #read the bytes once so the latin-1 fallback decodes the same content
        fi.close()
        try:
            string=string+temp+'\n'+raw.decode("utf-8") +'\n'
            #print(string)
        except UnicodeDecodeError:
            print ("latin")
            simp+=1
            latin_arr.append(row[0])
            string=string+temp+'\n'+raw.decode("latin-1") +'\n'


f=open('concat2.txt','w', encoding="utf-8")
f.write(string)
print (simp)
f2=open('latin_files_names.txt','w', encoding="utf-8")
f2.write('\n'.join(latin_arr))   #write() needs a string, so join the list of file ids
print(latin_arr)
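
Note: each record written to concat2.txt starts with a header of the form '**** *var1_value *var2_value ...', with blank cells replaced by 'x', followed by the matching txt file's text. A small sketch of the header construction for one hypothetical row:

row = ['t2014052912345', 'loksabha', '', 'agriculture']   # hypothetical metadata row
temp = '**** '
for i in range(len(row)):
    if row[i] == ' ' or row[i] == '':
        row[i] = 'x'
    temp = temp + '*var{}_'.format(i+1) + str(row[i]) + ' '
print(temp)   # "**** *var1_t2014052912345 *var2_loksabha *var3_x *var4_agriculture "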
