
decca-pos-reduce.py

#!/usr/bin/env python

""" this code has been extended and slightly modified

    original code was written by Markus Dickinson and Detmar Meurers
    -- hildebax, korpling hu-berlin

    in addition to the regular output, a reduced output is generated,
    in which n-grams that are contained in any (m>n)-gram are removed.
    the removed n-grams are also written to a file.

    additional cmd param: -g / --gaps
    if --gaps is given, the 'reduced' and 'deleted' files will contain empty
    rows exactly where the other file contains data, so the two files stay
    line-aligned with the unreduced output
"""

# DECCA information:
#
# Copyright (C) 2006 Markus Dickinson, Detmar Meurers, Adriane Boyd
# Contact: decca@ling.osu.edu
# URL: http://decca.osu.edu
# License: GNU GPL (http://decca.osu.edu/software/license.html)


""" decca-pos-reduce.py

    Code to calculate all variation n-grams: takes in a corpus in TnT  
    format and writes out n files as described in Dickinson and Meurers
    (2003), "Detecting Errors in Part-of-Speech Annotation" (EACL-03)

    Authors:  Markus Dickinson and Detmar Meurers
    Date:  May 13, 2003
    Paper link:
       http://www.ling.ohio-state.edu/~dm/papers/dickinson-meurers-03.html

"""

# --------------------------------------------------------
# USER SETTINGS
# --------------------------------------------------------

# Please adjust the settings below based on your current system
# configuration and input corpus.

# Token separator (use any string not found in your corpus)
# [We use " ## " by default to separate tokens in a continuous string.
#  Any sequence which does not appear in the corpus will work,
#  but a simple space will not work for a corpus which contains
#  multi-word tokens (e.g. "in front of" or "[ mdash")]

tokensep = " ## "
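# e.g. the token sequence ["in front of", "the"] is stored as the string
# "in front of ## the", which splits back unambiguously; a plain space
# join ("in front of the") would not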
   
# Optional: default filenames to be used if unspecified on command line
# (Files specified on command line will override these settings.)

input_corpus = "../testkorpus_neu.txt"
destination_dir = "../corpus-output/"
output_file_stem = "ngrams"
output_gaps = False

# --------------------------------------------------------
# END USER SETTINGS
# --------------------------------------------------------

# import included modules
import getopt
import sys
import os
import commands
import fileinput

# import bsddb
import bsddb

# import included minimized FreqDist
from nltk_freqdist import *
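# (only the FreqDist methods inc(), N(), samples() and count() are used below)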

# specify the long options in arglist
arglist = ['corpus=','directory=','file=','gaps','help']
# parse the command line call
opts, args = getopt.getopt(sys.argv[1:],"c:d:f:hg",arglist)

# Go through the command line options and see if the user specified a
# directory or a corpus, or asked for help

for option, specification in opts:
    if option in ("-d", "--directory"):
        destination_dir = specification
    elif option in ("-c", "--corpus"):
        input_corpus = specification
    elif option in ("-f", "--file"):
        output_file_stem = specification
    elif option in ("-g","--gaps"):
        output_gaps = True
    elif option in ("-h", "--help"):
        print """
Code to calculate variation n-grams: takes in a corpus in TnT
format and writes out n files to a directory, as described in
Dickinson and Meurers (2003), "Detecting Errors in Part-of-Speech
Annotation" (EACL-03)

Options:

-c/--corpus      specify the (absolute) corpus name
-d/--directory   specify the (absolute) output directory name
-f/--file        specify the base name for the output files
-g/--gaps        write empty lines in reduced and deleted output files
-h/--help        display this help menu
"""
        sys.exit()

# --------------------------------------------------------
# FUNCTIONS

# the function 'get_word_tag' takes a line in TnT format (word '\t'
# tag) and returns the word and tag

def get_word_tag(line):
    spl = line.split('\t')

    try:
        # For tt style:
        word = spl[0]
        tag = spl[1]
    except IndexError:
        sys.stderr.write("\n\nError: Incorrectly formatted input corpus.\nThe expected format is 'word<TAB>tag'.\n")
        sys.stderr.write("Unable to parse line:\n\n" + line + "\n")
        sys.exit(1)

    return word, tag
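
# e.g. get_word_tag("can\tMD") returns ("can", "MD")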

# the function 'add_to_dict' adds a word and its tag to a given dictionary

def add_to_dict(Dict,word,tag):
    # if the dictionary already has the word, merely increment the tag.
    if Dict.has_key(word):
        Dict[word].inc(tag)
    # otherwise, first create the distribution, then increment the tag.
    else:
        Dict[word] = FreqDist()
        Dict[word].inc(tag)

def to_string(n):
    # pad n with leading zeros to (at least) three digits, for filenames
    return str(n).zfill(3)
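# e.g. to_string(7) == "007", to_string(42) == "042", to_string(1234) == "1234"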

# --------------------------------------------------------
# STEP 0: Initialization

# add a trailing "/" if not already there since it's a directory
if destination_dir[len(destination_dir)-1] != "/":
    destination_dir += "/"

# n is the n-gram counter.  We start with unigrams, so we start with 1

n = 1

# initialize Corpus, which will hold the entire corpus indexed from 1
try:
    Corpus = bsddb.btopen(None)
except:
    sys.stderr.write("\n\nError: Unable to open temporary db for corpus\n")
    sys.exit(1)

# initialize index to 1; at the end of the loop index is corpus-size + 1
index = 1

# initialize Dict, which will hold a dictionary keyed by words, mapping
# each word to a frequency distribution of its tags
Dict = {}

# --------------------------------------------------------
# STEP 1: Read in corpus

print "Using corpus: "+input_corpus
print "Writing to:   "+destination_dir
sys.stdout.flush()

# concatenate the path name with the file name and a dot for the
# extension, to be used in the rest of the code
destination_dir += output_file_stem + "."

# open the corpus for reading
try:
    corpus_file = open(input_corpus,'r')
except IOError:
    sys.stderr.write("\n\nError: Unable to open " + input_corpus + "\n")
    sys.exit(1)

# tell the user we're reading in the corpus (i.e. unigrams)
print "001 grams:",
sys.stdout.flush()

# Read the entire input, storing each word-tag pair in Corpus

# read the first line of the corpus
line = corpus_file.readline()

while line:
    # strip the line of trailing whitespace (newline, tabs, etc.)
    line = line.rstrip()

    if line:

        # input is in TnT format:  word \t tag
        word, tag = get_word_tag(line)

        # store the line
        Corpus[str(index)] = word + '\t' + tag

        # add word & tag to dictionary Dict
        add_to_dict(Dict, word, tag)

        # increment counter, i.e. next line in corpus
        index = index + 1

    # read the next line of the corpus (outside the 'if', so that an
    # empty line in the corpus cannot cause an infinite loop)
    line = corpus_file.readline()

# We're done reading in the corpus, so close that file:
corpus_file.close()

# At the end of the loop: index = last corpus position + 1

print "read in,",
sys.stdout.flush()

# --------------------------------------------------------
# STEP 2:
# put all the ambiguous unigrams into Grams, indexed by the corpus
# position (i.e. identical to Corpus, but only the ambiguously tagged
# words are included)

# set Grams to be an empty temporary database
try:
    Grams = bsddb.btopen(None)
except:
    sys.stderr.write("\n\nError: Unable to create temporary db for n-grams\n")
    sys.exit(1)

# for every corpus position, see if it has multiple tags.  If so,
# store it in Grams.  If not, delete it from Dict, so after this loop Dict
# will only be left with entries that have multiple tags (making
# printing easier).  Note that this is why we must first check that Dict
# has the key word.

for i,line in Corpus.iteritems():
    spl = line.split('\t')
    word = spl[0]

    if (Dict.has_key(word)):
        if (len(Dict[word].samples()) > 1): # if the word has more than one tag
            Grams[i] = Corpus[i]         # save that line in Grams
        else:
            del Dict[word]

        # note that we are only deleting non-varying occurrences, so all
        # corpus positions with a varying n-gram will be saved.


# --------------------------------------------------------
# STEP 3: loop over increasingly longer n-grams until none are found

# MAIN LOOP: loop until Grams, which stores all varyingly-tagged
# n-grams, has no more elements -- i.e. there are no n-grams of that
# size which are tagged in multiple ways
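# (hypothetical example: if the word "can" occurs tagged both as "MD" and
#  as "NN", it is a variation unigram; if the bigram "the ## can" likewise
#  varies between "DT ## MD" and "DT ## NN", it survives into the next round)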

while Grams:
    # begin by printing out results for the current n
    print "variations found,",
    sys.stdout.flush()

    n_str = to_string(n)
    filename = destination_dir+n_str
#    if os.path.exists(filename):
#        sys.stderr.write("\n\nError: Output file " + filename + " already exists.\n")
#        sys.exit(1)
    try:
        outfile = open(filename,'w')
    except IOError:
        sys.stderr.write("\n\nError: Unable to open output file " + filename + "\n")
        sys.exit(1)
    
    # print out all the n-grams in Dict, which will be the varying ones
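    # each output line has the form:
    #   total \t ngram \t count_1 \t tag_1 \t count_2 \t tag_2 ...
    # e.g. "3\tcan\t2\tMD\t1\tNN" for a unigram tagged twice MD, once NN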

    for words in Dict.keys():
        line = str(Dict[words].N()) + '\t' + words
        for tags in Dict[words].samples():
            count = Dict[words].count(tags)
            line = line + '\t' + str(count) + '\t' + tags
        outfile.write(line+'\n')
        del Dict[words]

    # close the file -- we are done writing this n-gram size
    outfile.close()

    # print out a note to the screen that these n-grams are finished.
    print "written to file,",
    sys.stdout.flush()

    # sort the file using unix sort, output into the file itself
    commands.getstatusoutput("sort -nr "+filename+" -o "+filename)

    print "and file sorted."
    sys.stdout.flush()

    # Increment n: we are now dealing with the next higher n-gram
    n = n + 1

    # (re)initialize Store, which will store the n-grams, indexed by the
    # corpus position of the first element in the n-gram
    try:
        Store = bsddb.btopen(None)
    except:
        sys.stderr.write("\n\nError: Unable to open temporary db for storage\n")
        sys.exit(1)

    # (re)initialize Dict, which will hold, for each word n-gram, a
    # frequency distribution of the tag sequences occurring with it
    Dict = {}

    # define n_str as str(n) padded with leading 0s, for output/filenames
    n_str = to_string(n)

    # tell user we're starting work on this n:
    print n_str+" grams:",
    sys.stdout.flush()

    for key,line in Grams.iteritems():
        # make a list from the word and tag strings stored in Grams

        words, tags = get_word_tag(line)

        wordlist = words.split(tokensep)
        taglist = tags.split(tokensep)

        numkey = int(key)

        # if there is something in the corpus one position to the 
        # left of the n-1 gram  and we haven't already created an
        # n-gram at that position, then create it
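        # (e.g. for n=3 and a varying bigram at positions 5-6, this creates
        #  the trigram spanning positions 4-6, indexed at position 4)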
        if (numkey > 1) and (not Store.has_key(str(numkey-1))):

            # newkey is the index of the first word in this n-gram
            newkey = str(numkey-1)

            # push the previous word in the corpus onto the front of the
            # list and the previous tag, as well

            corpline = Corpus[newkey]
            spl = corpline.split('\t')
            wordlist.insert(0,spl[0])
            taglist.insert(0,spl[1])

            # create strings from the n-gram lists
            wordline = tokensep.join(wordlist)
            tagline = tokensep.join(taglist)

            # store the wordline and tagline, indexed by the position
            # of the first word
            Store[newkey] = wordline + '\t' + tagline

            # add the tags to the words' dictionary slot.
            add_to_dict(Dict,wordline,tagline)

            # pop the first element off both lists, so that when we
            # create an n-gram to the right, we will be dealing with
            # the original n-1 gram
            wordlist.pop(0)
            taglist.pop(0)

        # if there is a position in the corpus to the right of the entire
        # n-gram (last corpus position = index-1 !) and no n-gram has
        # been created at this key, create a new n-gram to the right
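        # (e.g. for n=3 and a varying bigram at positions 5-6, this creates
        #  the trigram spanning positions 5-7, still indexed at position 5)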
        if ((numkey + (n-1)) <= (index-1)) and (not Store.has_key(key)):

            # newkey will be the position of the *last* token in the
            # n-gram; key will be the position we index the n-gram on,
            # i.e. the starting position
            newkey = str(numkey+(n-1))

            # append the rightmost word and tag onto their lists

            corpline = Corpus[newkey]
            spl = corpline.split('\t')
            wordlist.append(spl[0])
            taglist.append(spl[1])

            # create strings from the lists
            wordline = tokensep.join(wordlist)
            tagline = tokensep.join(taglist)

            # Store the wordline and tagline, indexed by the position
            # of the first word
            Store[key] = wordline + '\t' + tagline

            # add the tags to the words' dictionary slot.
            add_to_dict(Dict,wordline,tagline)
    # end for (key,line in Grams.iteritems())

    # reinitialize Grams
    Grams.close()
    try:
        Grams = bsddb.btopen(None)
    except:
        sys.stderr.write("\n\nError: Unable to open temporary db for n-grams\n")
        sys.exit(1)

    # print a note to the screen that these n-grams have been indexed.
    print "read in,",
    sys.stdout.flush()

    # loop over the indexed positions in the Store
    for i in Store.keys():

        storeline = Store[i]
        spl = storeline.split('\t')
        
        # get the word n-gram
        words = spl[0]

        # if this word n-gram has not yet been deleted, then if it is a
        # varying n-gram, put it in Grams.  Otherwise, remove it from Dict
        if (Dict.has_key(words)):
            if (len(Dict[words].samples()) > 1):
                # Fill the Grams dictionary from the Store one
                Grams[i] = Store[i]
            else:
                del Dict[words]

        del Store[i]

    # end for (i in Store.keys())

    Store.close()

# end while Grams

print "and no variations found."
sys.stdout.flush()

# close the temporary databases

Corpus.close()
Grams.close()



##### additional code by hildebax: delete n-grams that occur in (m>n)-grams



def process_line(line):
    # parse a line of an n-gram file ('total \t words \t cnt1 \t tag1 ...')
    # and return the word n-gram plus its tab-joined tag variants
    split = line.strip().split('\t')
    words = split[1]
    poss = ''
    k = 3
    while k < len(split):
        poss = poss + split[k] + '\t'
        k = k + 2
    return words, poss

# NOTE: part of the original reduction code was lost here; the containment
# check below is a reconstruction. It assumes an n-gram is redundant iff
# its word string occurs, at token boundaries, inside some (m>n)-gram.

max_n = n - 1   # highest n for which an output file was written

# read every n-gram file back in: n_maps[i] maps each word i-gram to its tags
n_maps = {}
for i in range(1, max_n + 1):
    n_maps[i] = {}
    for line_i in fileinput.input([destination_dir + to_string(i)]):
        words, poss = process_line(line_i)
        n_maps[i][words] = poss

for i in range(1, max_n + 1):
    n_map = n_maps[i]

    # delete contained i-grams until the map stops shrinking (fixed point)
    while True:
        n_map_oldSize = len(n_map)
        delete_words = []
        for words_n in n_map.keys():
            # pad with tokensep so containment is only found at token boundaries
            padded_n = tokensep + words_n + tokensep
            for m in range(i + 1, max_n + 1):
                if any(padded_n in (tokensep + words_m + tokensep)
                       for words_m in n_maps[m]):
                    delete_words.append(words_n)
                    break
        for words_n in delete_words:
            del n_map[words_n]
        n_map_newSize = len(n_map)

        if n_map_oldSize == n_map_newSize:
            break

    f_reduced = open(destination_dir + 'reduced.' + to_string(i),'w')
    f_deleted = open(destination_dir + 'deleted.' + to_string(i),'w') 
    for line_i in fileinput.input([destination_dir + to_string(i)]):
        words,poss = process_line(line_i)
        if words in n_map:
            f_reduced.write(line_i)
            if output_gaps: f_deleted.write('\n')
        else:
            if output_gaps: f_reduced.write('\n')
            f_deleted.write(line_i)
    f_reduced.close()
    f_deleted.close()
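
# Example invocation (paths are illustrative):
#
#   python decca-pos-reduce.py -c corpus.tnt -d out/ -f ngrams --gaps
#
# writes out/ngrams.001, out/ngrams.002, ... plus the corresponding
# out/ngrams.reduced.NNN and out/ngrams.deleted.NNN files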