#! /usr/bin/env python
#-*- coding: utf-8-*-
# author: vivian voigt
# version: 3.0
#
# 2014-11-16 uwe springmann:
# reflect transcription changes in dipl: 'ꝰ':'us','ꝝ':'rum','d̉':'der','v̉':'ü'
# additional rules for tildes instead of macrons in dipl
# 2014-11-17 vivian voigt:
# added 'oͦ':'o' to the replacement-list
# added a function that checks for macrons in tokens split across lines
# 2016-01-12 vivian voigt:
# removed replacement of 'í' to 'i', ,'ű' to 'ü', '$' to 'us' and '€' to 'der'
#2016-03-16 laura perlitz:
#added replacements of 'ꝰ' to 'us', 'v̉ ' to 'ü' and 'ð' to 'der'
#2016-04-06 laura perlitz:
#deleted all replacements of macrons
# 2016-05-30 vivian voigt:
# added a list of different forms of "und" and "or" (truncHelper), to distinguish between words that are separated because of a line break and words that are separated because of truncations
# 2016-06-02 vivian voigt:
# made the script upward compatible with python 3.x (tested with 3.4.3)
import sys, re
try:
import StringIO
except ImportError:
import io as StringIO
def searchHyphen(str):
    """Return True if the second-to-last character of ``str`` is a hyphen.

    The check looks at index -2, i.e. the character just before the final
    one (presumably the final character is the line's trailing newline --
    TODO confirm against the caller).

    Strings shorter than two characters (empty string, a bare newline, a
    single character) cannot contain a hyphen at that position, so they
    yield False instead of raising IndexError.

    The parameter name shadows the builtin ``str``; it is kept unchanged so
    that existing keyword callers continue to work.
    """
    # Guard the index access: str[-2] raises IndexError when len(str) < 2.
    return len(str) >= 2 and str[-2] == '-'
def replace_all(text, dic):
    """Apply every replacement pair in ``dic`` to ``text`` and return the result.

    Each key of ``dic`` is substituted by its value using ``str.replace``.
    The pairs are applied one after another in the mapping's iteration
    order, so a later pair also operates on the output of earlier pairs.
    """
    result = text
    for needle in dic:
        result = result.replace(needle, dic[needle])
    return result
# list of replacements
# Maps special glyphs, ligatures and base-letter + combining-mark sequences
# found in the transcription to plain replacement strings.
# Presumably this mapping is handed to replace_all() further down in the
# file -- TODO confirm against the caller.
# NOTE(review): the abbreviation sign mapped to 'us' appears to be listed
# twice, and the u-with-ring key also appears twice (possibly once precomposed
# and once as a combining sequence -- verify). In a dict literal a repeated
# key simply keeps the last value, and both occurrences map to the same
# string here, so the duplicates look harmless.
# NOTE(review): some keys overlap (e.g. the v-with-hook key exists with and
# without a trailing space), so the result can depend on the order in which
# the pairs are applied.
reps = {'ꝰ':'us', 'v̉ ':'ü', 'ð':'der', 'ſ':'s', 'ů':'u', 'ů':'u', '⸗':'-', 'æ':'ae', 'Æ':'AE', 'œ':'oe', 'Œ':'OE', 'å':'a', 'aͤ':'ä', 'ͤa':'ä', 'äͤ':'ä', 'oͤ':'ö', 'uͤ':'ü','vͤ':'ü', 'Aͤ':'Ä', 'Oͤ':'Ö', 'Uͤ':'Ü', 'Vͤ':'Ü', '˖':':', 'ʒ':'z', 'ȝ':'z', 'v̂':'ü','o̊':'o','oͦ':'o', 'ꝰ':'us','ꝝ':'rum','d̉':'der','v̉':'ü', '℞':'recipe', '℔':'libra', '℥':'uncia', '℈':'scrupel', 'ÿ':'y', 'dᷣ':'der', 'ꝺᷣ':'der'}
# Set of spelling variants (lower-case and capitalised) of "und" and "oder".
# According to the changelog in the file header, it is used to tell words
# that were split by a line break apart from words that are truncated; the
# code that consults this set is not visible in this part of the file.
truncHelper = {'und', 'Und', 'oder', 'Oder', 'vnd', 'Vnd', 'vnnd', 'Vnnd', 'vñ', 'Vñ', 'vn̄', 'Vn̄', 'oð', 'Oð', 'odder', 'Odder', 'vn', 'Vn', 'od̉', 'Od̉', 'unnd', 'Unnd', 'undt', 'Undt', 'uñ', 'Uñ', 'vund', 'Vund', 'uud', 'Uud', 'Vud', 'vud', 'nnd'}
def main(argv):
if len(argv) < 3:
print("usage: ./clean.py