Creating a Multi-level Dictionary for Text Analysis in R (Python generation script)
This script was written to generate the multi-level (1:5) dictionary used for the submitted article “Aping the People”: A Text Analysis of Populist Mimesis in Narendra Modi Speeches. A copy is available in the SAtextometry repository on GitHub.
The Python script generates the multi-level dictionary (levels 1:5) in both YAML and LIWC formats from the CSV file “variables.csv”. An example of the required CSV formatting can be found below the code section. Please note that the attached dictionary is a draft only: it differs substantially from the submitted version.
import pandas as pd
import yaml
import csv
import re
def remove_null(alist):
    """Return a copy of *alist* with all null entries (NaN/None) removed.

    Parameters
    ----------
    alist : iterable
        Sequence of values, possibly containing NaN/None (e.g. a pandas row).

    Returns
    -------
    list
        The values of *alist* for which ``pd.isna`` is False, in order.
    """
    # Comprehension replaces the manual append loop; `not pd.isna(val)`
    # replaces the non-idiomatic `pd.isna(val) == False` comparison.
    return [val for val in alist if not pd.isna(val)]
def make_not_columns(nflevels, levels):
    """Return the 0-based indices of the levels that should be dropped.

    Parameters
    ----------
    nflevels : int
        Total number of levels present (numbered 1..nflevels).
    levels : container of int
        The 1-based level numbers to keep.

    Returns
    -------
    list of int
        0-based index of every level NOT listed in *levels*, ascending.
    """
    # Comprehension replaces the manual append loop of the original.
    return [i - 1 for i in range(1, nflevels + 1) if i not in levels]
def make_and_think(js, arr, level):
    """Ensure the nested-dict path described by ``arr[level:]`` exists in *js*.

    Each intermediate key maps to a dict and the final key maps to a list;
    keys that already exist are left untouched. Returns *js* (mutated in
    place) so calls can be chained.
    """
    key = arr[level]
    is_leaf = level == len(arr) - 1
    if key not in js:
        # Leaves hold the word lists; inner nodes hold further levels.
        js[key] = [] if is_leaf else {}
    if not is_leaf:
        make_and_think(js[key], arr, level + 1)
    return js
def make_liwc(dict_levels, arr):
    """Register headings and file each entry under their LIWC ids.

    Updates the module-level LIWC state:

    - ``all_headings``: heading name -> unique integer id, assigned from the
      running ``head_count`` counter on first sight.
    - ``all_items``: word/phrase -> list of heading ids it belongs to.

    Parameters
    ----------
    dict_levels : sequence
        The hierarchy of category headings for one CSV row.
    arr : sequence
        The words/phrases filed under those headings.
    """
    global head_count
    global all_items
    global all_headings
    item_arr = []
    for key in dict_levels:
        if key not in all_headings:
            # First occurrence of this heading: assign the next free id.
            # (The original's extra `head_count not in item_arr` guard was
            # always true — item_arr only holds ids < head_count — so it
            # has been dropped.)
            all_headings[key] = head_count
            head_count += 1
        item_arr.append(all_headings[key])
    for item in arr:
        # setdefault collapses the original's duplicated new/existing branches.
        numbers = all_items.setdefault(item, [])
        for number in item_arr:
            if number not in numbers:
                numbers.append(number)
def makeliwc_comp(filename):
    """Append the collected LIWC dictionary to *filename*.

    Writes the standard LIWC ``.dic`` layout: a ``%``-delimited header
    section of ``heading<TAB>id`` lines, then one ``entry<TAB>id1<TAB>id2...``
    line per dictionary entry.

    Reads the module-level ``all_headings`` and ``all_items`` built by
    ``make_liwc``. The file is opened in append mode, so repeated calls
    accumulate output.
    """
    global head_count
    global all_items
    global all_headings
    with open(filename, "a") as file:
        file.write('%\n')
        for key, number in all_headings.items():
            file.write(str(key) + "\t" + str(number) + "\n")
        file.write('%\n')
        for key, numbers in all_items.items():
            # join replaces the manual last-element bookkeeping loop; it also
            # fixes the original's missing newline when `numbers` is empty.
            ids = "\t".join(str(n) for n in numbers)
            file.write(str(key) + "\t" + ids + "\n")
def make_and_put(js, arr, value, level):
    """Append every element of *value* to the list at key path ``arr[level:]``.

    Descends *js* along the keys of *arr*; when the full path exists and
    ends in a list, the elements of *value* are appended to it. A missing
    key anywhere on the path silently ends the walk (callers are expected
    to create the path first with ``make_and_think``). Returns *js*.
    """
    node = js
    last = len(arr) - 1
    # Iterative walk replaces the original recursion.
    for depth in range(level, last):
        step = arr[depth]
        if step not in node:
            return js
        node = node[step]
    leaf_key = arr[last]
    if leaf_key in node:
        assert isinstance(node[leaf_key], list)
        node[leaf_key].extend(value)
    return js
def make_dictionary(nflevels=3, levels=None, filename="variables.yml",
                    file_to_read="variables.csv",
                    liwc_filename="variables_liwc.dic"):
    """Build the multi-level dictionary from a CSV and dump it as YAML + LIWC.

    Parameters
    ----------
    nflevels : int
        Total number of hierarchy levels present in the CSV.
    levels : list of int, optional
        1-based level numbers to keep; defaults to ``[1, 2, 3]``.
    filename : str
        Output path for the YAML dictionary (overwritten).
    file_to_read : str
        Input CSV path (see the formatting example at the end of the file).
    liwc_filename : str
        Output path for the LIWC-format dictionary (appended to).
    """
    # None-sentinel avoids a mutable default argument; behavior is unchanged.
    if levels is None:
        levels = [1, 2, 3]
    # ---------------------------
    df = pd.read_csv(file_to_read)
    levelsdropped = make_not_columns(nflevels, levels)
    df = df.drop(levelsdropped)
    nflevelsdropped = len(levelsdropped)
    levels_remaining = nflevels - nflevelsdropped
    # Transpose so each original CSV column becomes a row of values.
    df = df.T
    (nfrows, nfcolumns) = df.shape
    # ---------------------------
    # Preprocessing: strip the NaN padding pandas adds to short columns.
    rows = []
    for i in range(nfrows):
        rows.append(remove_null(df.iloc[i]))
    # ---------------------------
    # Build the nested dict. Rows come in pairs: the entry row followed by
    # its case-sensitivity row ('c' = case sensitive, 'nc' = insensitive).
    js = {}
    for row_no in range(0, len(rows) - 1, 2):
        row = rows[row_no]
        c_row = rows[row_no + 1]
        dict_levels = row[:levels_remaining]
        make_and_think(js, dict_levels, 0)
        arr = []
        for item_no in range(levels_remaining, len(row)):
            item = row[item_no]
            if c_row[item_no] == "nc":
                # Case-insensitive entry: store both the original form and
                # its capitalized/lower-cased counterpart.
                if item == item.lower():
                    arr.append(item)
                    arr.append(item[0].capitalize() + item[1:])
                else:
                    arr.append(item)
                    arr.append(item.lower())
            else:
                arr.append(item)
        make_liwc(dict_levels, arr)
        make_and_put(js, dict_levels, arr, 0)
    # ---------------------------------------
    # Dump to LIWC and YAML. `with` fixes the original's leaked file handle.
    makeliwc_comp(liwc_filename)
    with open(filename, 'w+') as ff:
        yaml.dump(js, ff, allow_unicode=True, default_flow_style=False)
# Module-level LIWC state shared (via `global`) by make_liwc and
# makeliwc_comp: heading name -> id, entry -> list of heading ids, and the
# next id to assign.
all_headings = {}
all_items = {}
head_count = 1
# Build the dictionary with the default settings (3 levels, variables.csv).
make_dictionary()
Below is an example of the required CSV formatting: ‘c’ stands for case-sensitive, ‘nc’ stands for case-insensitive. The example spreadsheet is accessible via the link in the SAtextometry repository.