Creating a Multi-level Dictionary for Text Analysis in R (Python generation script)
This script was written to generate the multi-level (1:5) dictionary used for the submitted article “Aping the People”: A Text Analysis of Populist Mimesis in Narendra Modi Speeches. A copy is available in the SAtextometry repository on GitHub.
The Python script generates the multi-level dictionary (levels 1:5) in both YAML and LIWC formats from the CSV file “variables.csv”. An example of the required CSV formatting can be found below the code section. Please note that the attached dictionary is a draft only: it differs substantially from the submitted version.
import pandas as pd
import yaml
import csv
import re
def remove_null(alist):
    """Return a copy of *alist* with all null entries (NaN/None) removed.

    Parameters
    ----------
    alist : iterable
        Sequence of values, possibly containing NaN/None (e.g. a pandas row).

    Returns
    -------
    list
        The values of *alist* for which ``pd.isna`` is False, in order.
    """
    # Comprehension replaces the manual append loop; `not pd.isna(val)`
    # replaces the non-idiomatic `pd.isna(val) == False` comparison.
    return [val for val in alist if not pd.isna(val)]
def make_not_columns(nflevels, levels):
    """Return the 0-based indices of the levels that should be dropped.

    Parameters
    ----------
    nflevels : int
        Total number of levels present (numbered 1..nflevels).
    levels : container of int
        The 1-based level numbers to keep.

    Returns
    -------
    list of int
        0-based index of every level NOT listed in *levels*, ascending.
    """
    # Comprehension replaces the manual append loop of the original.
    return [i - 1 for i in range(1, nflevels + 1) if i not in levels]
def make_and_think(js, arr, level):
    """Ensure the nested-dict path described by ``arr[level:]`` exists in *js*.

    Each intermediate key maps to a dict and the final key maps to a list;
    keys that already exist are left untouched. Returns *js* (mutated in
    place) so calls can be chained.
    """
    key = arr[level]
    is_leaf = level == len(arr) - 1
    if key not in js:
        # Leaves hold the word lists; inner nodes hold further levels.
        js[key] = [] if is_leaf else {}
    if not is_leaf:
        make_and_think(js[key], arr, level + 1)
    return js
def make_liwc(dict_levels, arr):
    """Register headings and file each entry under their LIWC ids.

    Updates the module-level LIWC state:

    - ``all_headings``: heading name -> unique integer id, assigned from the
      running ``head_count`` counter on first sight.
    - ``all_items``: word/phrase -> list of heading ids it belongs to.

    Parameters
    ----------
    dict_levels : sequence
        The hierarchy of category headings for one CSV row.
    arr : sequence
        The words/phrases filed under those headings.
    """
    global head_count
    global all_items
    global all_headings
    item_arr = []
    for key in dict_levels:
        if key not in all_headings:
            # First occurrence of this heading: assign the next free id.
            # (The original's extra `head_count not in item_arr` guard was
            # always true — item_arr only holds ids < head_count — so it
            # has been dropped.)
            all_headings[key] = head_count
            head_count += 1
        item_arr.append(all_headings[key])
    for item in arr:
        # setdefault collapses the original's duplicated new/existing branches.
        numbers = all_items.setdefault(item, [])
        for number in item_arr:
            if number not in numbers:
                numbers.append(number)
def makeliwc_comp(filename):
    """Append the collected LIWC dictionary to *filename*.

    Writes the standard LIWC ``.dic`` layout: a ``%``-delimited header
    section of ``heading<TAB>id`` lines, then one ``entry<TAB>id1<TAB>id2...``
    line per dictionary entry.

    Reads the module-level ``all_headings`` and ``all_items`` built by
    ``make_liwc``. The file is opened in append mode, so repeated calls
    accumulate output.
    """
    global head_count
    global all_items
    global all_headings
    with open(filename, "a") as file:
        file.write('%\n')
        for key, number in all_headings.items():
            file.write(str(key) + "\t" + str(number) + "\n")
        file.write('%\n')
        for key, numbers in all_items.items():
            # join replaces the manual last-element bookkeeping loop; it also
            # fixes the original's missing newline when `numbers` is empty.
            ids = "\t".join(str(n) for n in numbers)
            file.write(str(key) + "\t" + ids + "\n")
def make_and_put(js, arr, value, level):
    """Append every element of *value* to the list at key path ``arr[level:]``.

    Descends *js* along the keys of *arr*; when the full path exists and
    ends in a list, the elements of *value* are appended to it. A missing
    key anywhere on the path silently ends the walk (callers are expected
    to create the path first with ``make_and_think``). Returns *js*.
    """
    node = js
    last = len(arr) - 1
    # Iterative walk replaces the original recursion.
    for depth in range(level, last):
        step = arr[depth]
        if step not in node:
            return js
        node = node[step]
    leaf_key = arr[last]
    if leaf_key in node:
        assert isinstance(node[leaf_key], list)
        node[leaf_key].extend(value)
    return js
def make_dictionary(nflevels=3, levels=None, filename="variables.yml",
                    file_to_read="variables.csv",
                    liwc_filename="variables_liwc.dic"):
    """Build the multi-level dictionary from a CSV and dump it as YAML + LIWC.

    Parameters
    ----------
    nflevels : int
        Total number of hierarchy levels present in the CSV.
    levels : list of int, optional
        1-based level numbers to keep; defaults to ``[1, 2, 3]``.
    filename : str
        Output path for the YAML dictionary (overwritten).
    file_to_read : str
        Input CSV path (see the formatting example at the end of the file).
    liwc_filename : str
        Output path for the LIWC-format dictionary (appended to).
    """
    # None-sentinel avoids a mutable default argument; behavior is unchanged.
    if levels is None:
        levels = [1, 2, 3]
    # ---------------------------
    df = pd.read_csv(file_to_read)
    levelsdropped = make_not_columns(nflevels, levels)
    df = df.drop(levelsdropped)
    nflevelsdropped = len(levelsdropped)
    levels_remaining = nflevels - nflevelsdropped
    # Transpose so each original CSV column becomes a row of values.
    df = df.T
    (nfrows, nfcolumns) = df.shape
    # ---------------------------
    # Preprocessing: strip the NaN padding pandas adds to short columns.
    rows = []
    for i in range(nfrows):
        rows.append(remove_null(df.iloc[i]))
    # ---------------------------
    # Build the nested dict. Rows come in pairs: the entry row followed by
    # its case-sensitivity row ('c' = case sensitive, 'nc' = insensitive).
    js = {}
    for row_no in range(0, len(rows) - 1, 2):
        row = rows[row_no]
        c_row = rows[row_no + 1]
        dict_levels = row[:levels_remaining]
        make_and_think(js, dict_levels, 0)
        arr = []
        for item_no in range(levels_remaining, len(row)):
            item = row[item_no]
            if c_row[item_no] == "nc":
                # Case-insensitive entry: store both the original form and
                # its capitalized/lower-cased counterpart.
                if item == item.lower():
                    arr.append(item)
                    arr.append(item[0].capitalize() + item[1:])
                else:
                    arr.append(item)
                    arr.append(item.lower())
            else:
                arr.append(item)
        make_liwc(dict_levels, arr)
        make_and_put(js, dict_levels, arr, 0)
    # ---------------------------------------
    # Dump to LIWC and YAML. `with` fixes the original's leaked file handle.
    makeliwc_comp(liwc_filename)
    with open(filename, 'w+') as ff:
        yaml.dump(js, ff, allow_unicode=True, default_flow_style=False)
# Module-level LIWC state shared (via `global`) by make_liwc and
# makeliwc_comp: heading name -> id, entry -> list of heading ids, and the
# next id to assign.
all_headings = {}
all_items = {}
head_count = 1
# Build the dictionary with the default settings (3 levels, variables.csv).
make_dictionary()
Below is an example of the required CSV formatting: ‘c’ stands for case-sensitive, ‘nc’ stands for case-insensitive. The example spreadsheet is accessible via the link in the SAtextometry repository.