<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE TEI.2 PUBLIC "-//TEI P4//DTD Main Document Type//EN"
"http://www.infomotions.com/alex/dtd/tei2.dtd" [
<!ENTITY % TEI.XML         'INCLUDE' >
<!ENTITY % TEI.prose       'INCLUDE' >
<!ENTITY % TEI.linking     'INCLUDE' >
<!ENTITY % TEI.figures     'INCLUDE' >
<!ENTITY % TEI.names.dates 'INCLUDE' >
<!ATTLIST xptr   url CDATA #IMPLIED >
<!ATTLIST xref   url CDATA #IMPLIED >
<!ATTLIST figure url CDATA #IMPLIED >
]> 
<TEI.2>
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>define.py - given a carrel and a word, output definitions of the word</title> 
        <author>Eric Lease Morgan</author>
        <respStmt>
          <resp>converted into TEI-conformant markup by</resp>
          <name>Eric Lease Morgan</name>
        </respStmt>
      </titleStmt>
      <publicationStmt>
        <publisher>Eric Lease Morgan, &#169; University of Notre Dame</publisher>
        <address>
        	<addrLine>emorgan@nd.edu</addrLine>
        </address>
        <distributor>Available through the Distant Reader at <xptr url='https://distantreader.org/blog/define/' />.</distributor>
        <idno type='reader'>44</idno>
        <availability status='free'>
          <p>This document is distributed under a GNU General Public License.</p>
        </availability>
      </publicationStmt>
      <notesStmt>
       <note type='abstract'>tldnr - given lists of sentences and a word, use the Lesk algorithm to
       first disambiguate the word, and then output the word's WordNet definition</note>
      </notesStmt>
      <sourceDesc>
        <p>This is the original publication of this item.</p>
      </sourceDesc>
    </fileDesc>
    <profileDesc>
      <creation>
        <date>2023-11-06</date>
      </creation>
      <textClass>
        <keywords>
          <list><item>hacks</item></list>
        </keywords>
      </textClass>
    </profileDesc>
    <revisionDesc>
      <change>
<date>2023-11-06</date>
<respStmt>
<name>Eric Lease Morgan</name>
</respStmt>
<item>initial TEI encoding</item>
</change>
    </revisionDesc>
  </teiHeader>
  <text>
    <front>
    </front>
    <body>
      <div1><p rend='pre'>#!/usr/bin/env python

# <xref url='./define.py'>define.py</xref> - given a carrel and a word, output definitions of the word

# tldnr - given lists of sentences and a word, use the Lesk algorithm to 
# first disambiguate the word, and then output the word's WordNet definition

#  sample usage: $ ./define.py homer Hector
# sample output:
#
#      synset: hector.n.01
#   frequency: 396
#  definition: (Greek mythology) a mythical Trojan who was killed by Achilles
#              during the Trojan War
#
#      synset: strong-arm.v.02
#   frequency: 56
#  definition: be bossy towards

#  sample usage: $ ./define.py frankenstein fiend
# sample output:
#
#      synset: monster.n.04
#   frequency: 36
#  definition: a cruel wicked and inhuman person
#
#      synset: fanatic.n.01
#   frequency: 4
#  definition: a person motivated by irrational enthusiasm (as for a cause);
#              --Winston Churchill
#
#      synset: devil.n.02
#   frequency: 1
#  definition: an evil supernatural being


# Eric Lease Morgan &#60;emorgan@nd.edu&#62;
# (c) University of Notre Dame; distributed under a GNU Public License

# March   27, 2023 - first cut
# June     6, 2023 - added command line input
# November 5, 2023 - used more intelligent tokenization, and greatly refined output


# configure
LIBRARY = 'localLibrary'
VERBOSE = False

# require
from   nltk.wsd import lesk
from   nltk     import word_tokenize
from   rdr      import configuration, ETC, SENTENCES, Sentences
import sys

# get input; exactly two arguments are required: a carrel name and a word
if len( sys.argv ) != 3 : sys.exit( "Usage: " + sys.argv[ 0 ] + " &#60;carrel&#62; &#60;word&#62;" )
carrel = sys.argv[ 1 ]
word   = sys.argv[ 2 ]

# initialize
library   = configuration( LIBRARY )
# NOTE(review): rdr seems to overload the division operator to build the
# file-system path to the carrel's sentences file -- confirm against rdr docs
sentences = library/carrel/ETC/SENTENCES

# get and process each sentence; create a set of matching results
results = [] 
for sentence in Sentences( sentences ) : 
	
	# normalize; strip trailing whitespace (including the newline)
	sentence = sentence.rstrip()
	
	# filter; a case-insensitive substring match, so "art" also matches "part"
	if word.lower() in sentence.lower() :
					
		# disambiguate; the magic happens here
		synset = lesk( word_tokenize( sentence ), word )
		
		# update, conditionally; lesk may return None when no synset fits
		if synset : results.append( ( synset, sentence ) )
	
# count &#38; tabulate the results
synsets = {}
for result in results :

	# parse
	synset = result[ 0 ]
	
	# count and tabulate
	if synset in synsets : synsets[ synset ] += 1
	else                 : synsets[ synset ] =  1

# sort and process each resulting synset; tricky; most frequent synsets first
synsets = dict( sorted( synsets.items(), key=lambda x:x[ 1 ], reverse=True ) )
for synset in synsets.keys() :
	
	# get the list of matching sentences; pythonic
	# NOTE(review): identity ("is") comparison relies on NLTK returning cached
	# synset objects; an equality ("==") test would be more robust -- confirm
	sentences = [ result[ 1 ] for result in results if result[ 0 ] is synset ]
		
	# output; note that all results are written to stderr, not stdout
	sys.stderr.write( '      synset: ' + synset.name()            + '\n' )	
	sys.stderr.write( '   frequency: ' + str( synsets[ synset ] ) + '\n' )	
	sys.stderr.write( '  definition: ' + synset.definition()      + '\n' )
	
	# output some more; matching sentences are only shown when VERBOSE is True
	if VERBOSE :
		for sentence in sentences :
			sys.stderr.write( '    sentence: ' + sentence     + '\n' )
	
	# delimit
	sys.stderr.write( '\n' )

# done
exit()</p>


</div1>

    </body>
    <back>
    </back>
  </text>
</TEI.2>
