{Py} Extracting OCR text for Metadata Generation: The Example of PaRChA

by J-T.M. · Published July 13, 2019 · Updated July 3, 2020

This document summarises the data manipulation needed to format, name, tag and upload the OCRed material of the Pamphlet Repository for Changing Activism (PaRChA) together with its metadata. Complementary material is available in the PaRChA repository on GitHub.

PaRChA is a repository of pamphlets, posters, leaflets, manifestos, reports, letters and press releases produced by Indian student organisations of national and regional parties over four decades (1973–2015).

First script

This is filter.py.

# -*- coding: utf-8 -*-
import os.path
import re
import subprocess
import sys

from string import Template
from collections import OrderedDict

reference_extension = ".txt"

def usage():
    """Print the command synopsis on stderr, then exit with status 1."""
    for line in ('Usage: %s <dir>\n', 'example: %s mydirectory\n'):
        sys.stderr.write(line % __file__)
    sys.exit(1)
   

def get_file_content(filename):
    """Return the full text of ``filename``, decoded as Latin-1.

    Latin-1 assigns a character to every possible byte, so the decoding
    step cannot fail whatever the file contains.
    """
    with open(filename, mode='r', encoding='latin-1') as source:
        content = source.read()
    return content
    

# Any character that is not a word character, whitespace, or one of , . ;
# is treated as OCR noise.  The earlier pattern also contained ``\p``, which
# Python's ``re`` module rejects ("bad escape"), so the script stopped with
# an exception before printing anything.
NOISE_PATTERN = re.compile(r'[^\w\s,.;]')


def clean_text(text):
    """Return ``text`` with every OCR-noise character replaced by a space.

    Word characters, whitespace, commas, full stops and semicolons are kept
    as they are; each other character becomes exactly one space, so the
    length of the text is unchanged.
    """
    return NOISE_PATTERN.sub(' ', text)


if __name__ == '__main__':
    if len(sys.argv) <= 1:
        usage()

    # The argument is opened as a single text file (even though usage()
    # words it as <dir>); the name also avoids shadowing the dir() builtin.
    input_path = sys.argv[1]

    # Prepare fake data: reset the scratch copy foire/bb.txt from foire/aa.txt
    subprocess.call(["rm", "foire/bb.txt"])
    subprocess.call(["cp", "-a", "foire/aa.txt", "foire/bb.txt"])

    print(clean_text(get_file_content(input_path)))

    # NOTE: an earlier draft stopped here with sys.exit(1) and was followed by
    # an unreachable walk over the *.txt files that used names this script
    # never defines (csv, inventory_filename).  That dead code was removed and
    # a successful run now ends with the normal exit status 0.

Second script

This is inventory.py.

# -*- coding: utf-8 -*-
import csv
import hashlib
import json
import os.path
import subprocess
import sys

from string import Template
from collections import OrderedDict

reference_extension = ".jpg"
inventory_filename = "inventory.csv"
seq_id = 1

def usage():
    """Write the expected command line to stderr and exit with status 1."""
    script = __file__
    help_text = ('Usage: %s <dir>\n' % script) + ('example: %s mydirectory\n' % script)
    sys.stderr.write(help_text)
    sys.exit(1)
   
def sha1OfFile(filepath):
    """Return the hexadecimal SHA-1 digest of the file at ``filepath``.

    The file is read in binary mode, one fixed-size chunk at a time, so
    that hashing a large file does not require holding it entirely in
    memory.  The digest is the same as hashing the whole content at once.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha1()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops as soon as read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def writeJson(jsonFilename, data):
    """Serialise ``data`` as JSON into the file named ``jsonFilename``.

    The file is created or overwritten.  The destination is taken from the
    ``jsonFilename`` argument; the function previously ignored it and
    opened a module-level variable instead, so it only worked when the
    caller happened to have set that global to the same path.
    """
    with open(jsonFilename, 'w') as jsonFile:
        json.dump(data, jsonFile)

    

if __name__ == '__main__':
    if len(sys.argv) <= 1:
        usage()

    # Called root_dir rather than dir so the dir() builtin is not shadowed.
    root_dir = sys.argv[1]

    inventory_full_filename = os.path.join(root_dir, inventory_filename)

    # newline='' stops the text layer from translating the writer's '\n'
    # terminator (it would become '\r\n' on Windows), as the csv module
    # documentation recommends for files handed to csv.writer.
    with open(inventory_full_filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, lineterminator='\n')

        for dirname, dirnames, filenames in os.walk(root_dir):
            # os.walk yields entries in arbitrary filesystem order; sorting
            # (in place for dirnames, which steers the traversal) makes the
            # sequence ids reproducible from one run to the next.
            dirnames.sort()
            for filename in sorted(filenames):
                if not filename.endswith(reference_extension):
                    continue

                fullpath = os.path.join(dirname, filename)
                sha1 = sha1OfFile(fullpath)  # SHA-1 of the source picture

                # Metadata written next to the picture, keys in this order
                data = OrderedDict()
                data['id'] = seq_id
                data['jpg_filename'] = filename
                data['path'] = dirname
                data['jpg_sha1'] = sha1

                # JSON filename: filename.jpg --> filename.json, same folder
                filenameRaw, fileExtension = os.path.splitext(filename)
                jsonFilename = "{0}.{1}".format(filenameRaw, "json")
                fullpathJson = os.path.join(dirname, jsonFilename)

                print("{:<7}\t{}\t{}\t{}".format(seq_id, filename, dirname, sha1))  # Screen log
                writeJson(fullpathJson, data)  # One JSON metadata file per picture
                csvwriter.writerow([seq_id, filename, dirname, sha1])  # One inventory row per picture

                # Next picture gets the next sequence id
                seq_id += 1

Third script

This is tagall.py.

# -*- coding: utf-8 -*-
import glob
import os.path
import sys
from tagit import tagit

reference_extension = ".jpg"

def usage():
    """Print the command synopsis on stderr, then exit with status 1."""
    for line in ('Usage: %s <folder> <template>\n',
                 'example: %s myfolder template.xmp\n'):
        sys.stderr.write(line % __file__)
    sys.exit(1)


def tagall(folder, template_filename):
    """Run tagit() on every reference picture found under ``folder``.

    ``folder`` is walked recursively with os.walk.  For each file whose
    name ends with ``reference_extension`` the full path is printed and
    then passed to tagit() together with ``template_filename``.
    """
    for current_dir, _subdirs, names in os.walk(folder):
        pictures = (name for name in names if name.endswith(reference_extension))
        for picture in pictures:
            picture_path = os.path.join(current_dir, picture)
            print("tagging: {}".format(picture_path))
            tagit(picture_path, template_filename)
    

if __name__ == '__main__':
    # Both positional arguments (<folder> and <template>) are required.
    if len(sys.argv) < 3:
        usage()

    folder, template_filename = sys.argv[1], sys.argv[2]
    tagall(folder, template_filename)

Fourth script

This is tagit.py.

# -*- coding: utf-8 -*-
import json
import os.path
import subprocess
import sys

from string import Template


def usage():
    """Write the expected command line to stderr and exit with status 1."""
    script = __file__
    sys.stderr.write(
        ('Usage: %s <picture> <template> \n' % script)
        + ('example: %s pamphlet.jpg template.xmp\n' % script)
    )
    sys.exit(1)

def format_template(template, title, text):
    """Fill the ``title`` and ``description`` placeholders of ``template``.

    ``template`` uses string.Template syntax; ``title`` replaces the
    ``title`` placeholder and ``text`` replaces ``description``.  Strict
    substitution is used, so any other placeholder raises KeyError.
    """
    values = {'title': title, 'description': text}
    return Template(template).substitute(values)

def get_file_content(filename):
    """Read ``filename`` as Latin-1 text and return everything it holds."""
    with open(filename, mode='r', encoding='latin-1') as source:
        content = source.read()
    return content

def clean_title(title):
    """Drop the last dash-separated field of ``title`` and space out the rest.

    Everything after the final dash (expected to be the page number) is
    discarded, and each remaining dash is rewritten as " - ".  A title
    with no dash at all yields an empty string.
    """
    head, _dash, _last_field = title.rpartition("-")
    return head.replace("-", " - ")

def create_meta_data(template_filename, title, description_filename):
    """Build the metadata text for one picture.

    Reads the template from ``template_filename`` and the description from
    ``description_filename`` (in that order), then returns the template
    filled in with ``title`` and that description.
    """
    return format_template(
        get_file_content(template_filename),
        title,
        get_file_content(description_filename),
    )
   
def prepare_meta_data_file(template_filename, title, description_filename, meta_data_filename):
    """Render the metadata for one picture and save it as UTF-8 text.

    The text comes from create_meta_data(); it is written to
    ``meta_data_filename``, replacing any previous content.
    """
    rendered = create_meta_data(template_filename, title, description_filename)

    with open(meta_data_filename, mode='w', encoding='utf-8') as target:
        target.write(rendered)

def add_meta_data_to_picture_file(picture_filename, meta_data_filename):
    """Replace the picture's metadata with the content of the metadata file.

    exiftool is run twice on ``picture_filename``: first with ``-all=``,
    then with ``-tagsfromfile meta_data_filename``.  Its standard output is
    discarded and its exit status is not checked.  ``meta_data_filename``
    is deleted afterwards.
    """
    exiftool_runs = (
        ["exiftool", "-all=", picture_filename],
        ["exiftool", "-tagsfromfile", meta_data_filename, picture_filename],
    )
    for command in exiftool_runs:
        subprocess.call(command, stdout=subprocess.DEVNULL)
    os.remove(meta_data_filename)


def get_json_info(filename):
    """Parse the JSON document stored in ``filename`` and return the result."""
    with open(filename) as source:
        return json.load(source)

def tagit(picture_filename, template_filename='templateXMP-ok'):
    """Tag one picture using the files that sit next to it.

    For ``<folder>/<name>.jpg`` the function relies on three companions in
    the same folder: ``<name>.json`` (read for its 'path' and 'id'
    entries), ``<name>.txt`` (the description text) and ``<name>.xmp``
    (a temporary metadata file that is written here and removed once the
    picture has been tagged).

    The title is the normalised 'path' with its components joined by
    " - ", followed by " ID-<id>"; it is printed before tagging.
    """
    folder, picture_basename = os.path.split(picture_filename)
    stem = os.path.splitext(picture_basename)[0]

    def sibling(extension):
        # Path of the companion file sharing the picture's base name.
        return os.path.join(folder, stem + extension)

    description_filename = sibling(".txt")
    meta_data_filename = sibling(".xmp")

    json_data = get_json_info(sibling(".json"))
    path_parts = os.path.normpath(json_data['path']).split(os.sep)
    title = "{} ID-{}".format(" - ".join(path_parts), json_data['id'])

    print(title)

    prepare_meta_data_file(template_filename, title, description_filename, meta_data_filename)
    add_meta_data_to_picture_file(picture_filename, meta_data_filename)


if __name__ == '__main__':
    # Both positional arguments (<picture> and <template>) are required.
    if len(sys.argv) < 3:
        usage()

    picture_filename, template_filename = sys.argv[1], sys.argv[2]
    tagit(picture_filename, template_filename)