{Py} Extracting OCR text for Metadata Generation: The Example of PaRChA
<< This document summarises the data manipulation necessary to format, name, tag and upload OCRed material of the Pamphlet Repository for Changing Activism (PaRChA) and its metadata. Complementary material is available on the PaRChA repository on GitHub. |
PaRChA is a repository of pamphlets, posters, leaflets, manifestos, reports, letters and press releases produced by Indian student organisations of national and regional parties over four decades (1973–2015).
First script
This is filter.py.
# -*- coding: utf-8 -*-
import os.path
import re
import subprocess
import sys
from string import Template
from collections import OrderedDict
reference_extension = ".txt"
def usage():
    """Print the command-line synopsis on stderr, then exit with status 1."""
    for line in ('Usage: %s <dir>\n', 'example: %s mydirectory\n'):
        sys.stderr.write(line % __file__)
    sys.exit(1)
def get_file_content(filename):
    """Return the complete text of *filename*, decoded as Latin-1."""
    with open(filename, mode='r', encoding='latin-1') as source:
        content = source.read()
    return content
# Everything except word characters, whitespace, commas, full stops and
# semicolons. The original class also contained "\p"; Python's re module has
# no such class and rejects the unknown escape, so it is left out here.
_UNWANTED_CHARS = re.compile(r'[^\w\s,.;]')


def _clean_text(text):
    """Return *text* with every unwanted character replaced by one space.

    A character is kept when it is a word character (``\\w``), whitespace,
    a comma, a full stop or a semicolon; anything else becomes ``' '``.
    """
    return _UNWANTED_CHARS.sub(' ', text)


if __name__ == '__main__':
    if len(sys.argv) <= 1:
        usage()

    # The argument is opened as a single text file below, so it is not
    # called `dir` any more (which also shadowed the builtin).
    source_path = sys.argv[1]

    # Prepare fake data
    # NOTE(review): hard-coded development fixture, kept so existing runs
    # behave the same; confirm whether it can be dropped.
    subprocess.call(["rm", "foire/bb.txt"])
    subprocess.call(["cp", "-a", "foire/aa.txt", "foire/bb.txt"])

    print(_clean_text(get_file_content(source_path)))
    # The script now ends here with exit status 0. It used to call
    # sys.exit(1) after printing, which signalled failure on success and
    # left an inventory loop behind it that could never run (and relied on
    # names this script does not define).
Second script
This is inventory.py.
# -*- coding: utf-8 -*-
import csv
import hashlib
import json
import os.path
import subprocess
import sys
from string import Template
from collections import OrderedDict
reference_extension = ".jpg"
inventory_filename = "inventory.csv"
seq_id = 1
def usage():
    """Show how to invoke the script on stderr and terminate with status 1."""
    script = __file__
    sys.stderr.write('Usage: %s <dir>\n' % script)
    sys.stderr.write('example: %s mydirectory\n' % script)
    sys.exit(1)
def sha1OfFile(filepath):
    """Return the hexadecimal SHA-1 digest of the file at *filepath*.

    The file is opened in binary mode and hashed in fixed-size chunks, so a
    large scan is never loaded into memory in one piece. The digest is the
    same as hashing the whole content at once.
    """
    chunk_size = 64 * 1024
    digest = hashlib.sha1()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object that
        # read() returns at end of file.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
def writeJson(jsonFilename, data):
    """Serialise *data* as JSON into the file *jsonFilename*.

    The file is created or overwritten. The target is the path passed in
    by the caller; the function no longer reads a module-level variable
    to find out where to write.
    """
    with open(jsonFilename, 'w') as jsonFile:
        json.dump(data, jsonFile)
if __name__ == '__main__':
    if len(sys.argv) <= 1:
        usage()

    # Named root_dir rather than `dir` so the builtin dir() is not shadowed.
    root_dir = sys.argv[1]

    inventory_full_filename = os.path.join(root_dir, inventory_filename)
    # newline='' leaves line endings to the csv writer, as the csv module
    # documentation asks for; without it the explicit '\n' lineterminator
    # is rewritten on platforms that translate newlines in text mode.
    with open(inventory_full_filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, lineterminator='\n')

        for dirname, dirnames, filenames in os.walk(root_dir):
            for filename in filenames:
                if not filename.endswith(reference_extension):
                    continue

                fullpath = os.path.join(dirname, filename)
                sha1 = sha1OfFile(fullpath)  # SHA-1 of the source picture

                # Metadata record stored next to the picture; OrderedDict
                # fixes the order of the keys in the JSON output.
                data = OrderedDict()
                data['id'] = seq_id
                data['jpg_filename'] = filename
                data['path'] = dirname
                data['jpg_sha1'] = sha1

                # JSON filename: filename.jpg --> filename.json
                filenameRaw, _ = os.path.splitext(filename)
                jsonFilename = "{0}.{1}".format(filenameRaw, "json")
                fullpathJson = os.path.join(dirname, jsonFilename)

                # Screen log, one line per picture.
                print("{:<7}\t{}\t{}\t{}".format(seq_id, filename, dirname, sha1))

                writeJson(fullpathJson, data)  # per-picture JSON metadata file
                csvwriter.writerow([seq_id, filename, dirname, sha1])  # inventory row

                # Each picture gets the next sequence id.
                seq_id += 1
Third script
This is tagall.py.
# -*- coding: utf-8 -*-
import glob
import os.path
import sys
from tagit import tagit
reference_extension = ".jpg"
def usage():
    """Write the expected command line to stderr and exit with status 1."""
    templates = (
        'Usage: %s <folder> <template>\n',
        'example: %s myfolder template.xmp\n',
    )
    sys.stderr.writelines(template % __file__ for template in templates)
    sys.exit(1)
def tagall(folder, template_filename):
    """Tag every picture found under *folder*, sub-folders included.

    Each file whose name ends with ``reference_extension`` is announced on
    stdout and handed to ``tagit`` together with *template_filename*.
    """
    for current_dir, _subdirs, names in os.walk(folder):
        pictures = (name for name in names if name.endswith(reference_extension))
        for picture in pictures:
            target = os.path.join(current_dir, picture)
            print("tagging: {}".format(target))
            tagit(target, template_filename)
#picture_filename = 'test1/SFI98-2002part1 2.jpg'
#tagit(picture_filename)
if __name__ == '__main__':
    # Both the folder and the template must be given on the command line.
    if len(sys.argv) < 3:
        usage()
    folder, template_filename = sys.argv[1:3]
    tagall(folder, template_filename)
Fourth script
This is tagit.py.
# -*- coding: utf-8 -*-
import json
import os.path
import subprocess
import sys
from string import Template
def usage():
    """Report the expected arguments on stderr and exit with status 1."""
    write = sys.stderr.write
    write('Usage: %s <picture> <template> \n' % __file__)
    write('example: %s pamphlet.jpg template.xmp\n' % __file__)
    sys.exit(1)
def format_template(template, title, text):
    """Fill the ``title`` and ``description`` placeholders of *template*.

    *template* uses ``string.Template`` syntax; *text* is substituted for
    ``description``. The values are inserted verbatim, with no escaping.
    """
    placeholders = {'title': title, 'description': text}
    return Template(template).substitute(placeholders)
def get_file_content(filename):
    """Read *filename* as Latin-1 text and return everything it contains."""
    reader = open(filename, mode='r', encoding='latin-1')
    with reader:
        return reader.read()
def clean_title(title):
    """Drop the last dash-separated element of *title* and respace the rest.

    The title is split on "-", the final element (expected to be the page
    number) is removed, and the remaining elements are joined with " - ".
    A title without any dash has no page number to remove and is returned
    unchanged rather than collapsing to an empty string.
    """
    parts = title.split("-")
    if len(parts) == 1:
        # No dash at all: nothing to strip.
        return title
    return " - ".join(parts[:-1])
def create_meta_data(template_filename, title, description_filename):
    """Return the template file rendered with *title* and the description.

    The template is read first, then the description file; both contents
    are passed to ``format_template``.
    """
    return format_template(
        get_file_content(template_filename),
        title,
        get_file_content(description_filename),
    )
def prepare_meta_data_file(template_filename, title, description_filename, meta_data_filename):
    """Render the metadata and store it in *meta_data_filename* as UTF-8.

    The metadata is built before the output file is opened, and the output
    file is created or overwritten.
    """
    rendered = create_meta_data(template_filename, title, description_filename)
    with open(meta_data_filename, mode='w', encoding='utf-8') as sidecar:
        sidecar.write(rendered)
def add_meta_data_to_picture_file(picture_filename, meta_data_filename):
    """Replace the metadata of a picture with the content of a metadata file.

    Runs ``exiftool -all=`` on *picture_filename*, then
    ``exiftool -tagsfromfile`` with *meta_data_filename* as the source.
    exiftool's standard output is discarded. A non-zero exit status is
    reported on stderr instead of being ignored; processing still carries
    on, as before.

    *meta_data_filename* is deleted afterwards, also when launching
    exiftool raises (for instance because it is not installed), so no
    stray metadata file is left next to the picture.
    """
    commands = (
        ["exiftool", "-all=", picture_filename],
        ["exiftool", "-tagsfromfile", meta_data_filename, picture_filename],
    )
    try:
        for command in commands:
            status = subprocess.call(command, stdout=subprocess.DEVNULL)
            if status != 0:
                sys.stderr.write(
                    'warning: exiftool exited with status %d for %s\n'
                    % (status, picture_filename)
                )
    finally:
        os.remove(meta_data_filename)
def get_json_info(filename):
    """Load and return the JSON document stored in *filename*."""
    with open(filename) as source:
        return json.load(source)
def _build_title(json_data):
    """Return the picture title built from a JSON metadata record.

    The ``path`` entry is normalised and split on the path separator, and
    its components are joined with " - ". Empty components (an absolute
    path produces one in front) are skipped so the title never starts with
    a dangling " - ". The ``id`` entry is appended as "ID-<id>".
    """
    components = os.path.normpath(json_data['path']).split(os.sep)
    beautiful_path = " - ".join(part for part in components if part)
    return "{} ID-{}".format(beautiful_path, json_data['id'])


def tagit(picture_filename, template_filename='templateXMP-ok'):
    """Write title and description metadata into one picture.

    Next to *picture_filename* the function expects two files with the same
    base name: ``<name>.txt`` (the description text) and ``<name>.json``
    (a record with ``path`` and ``id`` entries). A ``<name>.xmp`` file is
    generated from *template_filename* in the same folder, applied to the
    picture and then removed by ``add_meta_data_to_picture_file``.

    The computed title is printed on stdout.
    """
    dirname = os.path.dirname(picture_filename)
    stem = os.path.splitext(os.path.basename(picture_filename))[0]

    description_filename = os.path.join(dirname, "%s.txt" % stem)
    meta_data_filename = os.path.join(dirname, "%s.xmp" % stem)
    json_filename = os.path.join(dirname, "%s.json" % stem)

    title = _build_title(get_json_info(json_filename))
    print(title)

    prepare_meta_data_file(template_filename, title, description_filename, meta_data_filename)
    add_meta_data_to_picture_file(picture_filename, meta_data_filename)
if __name__ == '__main__':
    # Both the picture and the XMP template must be given on the command line.
    if len(sys.argv) < 3:
        usage()
    picture_filename, template_filename = sys.argv[1:3]
    tagit(picture_filename, template_filename)