<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE TEI.2 PUBLIC "-//TEI P4//DTD Main Document Type//EN"
"http://www.infomotions.com/alex/dtd/tei2.dtd" [
<!ENTITY % TEI.XML         'INCLUDE' >
<!ENTITY % TEI.prose       'INCLUDE' >
<!ENTITY % TEI.linking     'INCLUDE' >
<!ENTITY % TEI.figures     'INCLUDE' >
<!ENTITY % TEI.names.dates 'INCLUDE' >
<!ATTLIST xptr   url CDATA #IMPLIED >
<!ATTLIST xref   url CDATA #IMPLIED >
<!ATTLIST figure url CDATA #IMPLIED >
]> 
<TEI.2>
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>define.py - given a carrel and a word, output definitions of the word</title> 
        <author>Eric Lease Morgan</author>
        <respStmt>
          <resp>converted into TEI-conformant markup by</resp>
          <name>Eric Lease Morgan</name>
        </respStmt>
      </titleStmt>
      <publicationStmt>
        <publisher>Eric Lease Morgan, &#169; University of Notre Dame</publisher>
        <address>
        	<addrLine>emorgan@nd.edu</addrLine>
        </address>
        <distributor>Available through the Distant Reader at <xptr url='https://distantreader.org/blog/define/' />.</distributor>
        <idno type='reader'>44</idno>
        <availability status='free'>
          <p>This document is distributed under a GNU General Public License.</p>
        </availability>
      </publicationStmt>
      <notesStmt>
       <note type='abstract'>tldnr - given lists of sentences and a word, use the Lesk algorithm to
       first disambiguate the word, and then output the word's WordNet definition</note>
      </notesStmt>
      <sourceDesc>
        <p>This is the original publication of this item.</p>
      </sourceDesc>
    </fileDesc>
    <profileDesc>
      <creation>
        <date>2023-11-06</date>
      </creation>
      <textClass>
        <keywords>
          <list><item>hacks</item></list>
        </keywords>
      </textClass>
    </profileDesc>
    <revisionDesc>
      <change>
<date>2023-11-06</date>
<respStmt>
<name>Eric Lease Morgan</name>
</respStmt>
<item>initial TEI encoding</item>
</change>
    </revisionDesc>
  </teiHeader>
  <text>
    <front>
    </front>
    <body>
      <div1><p rend='pre'>#!/usr/bin/env python

# <xref url='./define.py'>define.py</xref> - given a carrel and a word, output definitions of the word

# tldnr - given lists of sentences and a word, use the Lesk algorithm to 
# first disambiguate the word, and then output the word's WordNet definition

#  sample usage: $ ./define.py homer Hector
# sample output:
#
#      synset: hector.n.01
#   frequency: 396
#  definition: (Greek mythology) a mythical Trojan who was killed by Achilles
#              during the Trojan War
#
#      synset: strong-arm.v.02
#   frequency: 56
#  definition: be bossy towards

#  sample usage: $ ./define.py frankenstein fiend
# sample output:
#
#      synset: monster.n.04
#   frequency: 36
#  definition: a cruel wicked and inhuman person
#
#      synset: fanatic.n.01
#   frequency: 4
#  definition: a person motivated by irrational enthusiasm (as for a cause);
#              --Winston Churchill
#
#      synset: devil.n.02
#   frequency: 1
#  definition: an evil supernatural being


# Eric Lease Morgan &#60;emorgan@nd.edu&#62;
# (c) University of Notre Dame; distributed under a GNU Public License

# March   27, 2023 - first cut
# June     6, 2023 - added command line input
# November 5, 2023 - used more intelligent tokenization, and greatly refined output


# configure
LIBRARY = 'localLibrary'
VERBOSE = False

# require
from   nltk.wsd import lesk
from   nltk     import word_tokenize
from   rdr      import configuration, ETC, SENTENCES, Sentences
import sys

# get input; exactly two arguments are required: a carrel name and a word
if len( sys.argv ) != 3 : sys.exit( "Usage: " + sys.argv[ 0 ] + " &#60;carrel&#62; &#60;word&#62;" )
carrel = sys.argv[ 1 ]
word   = sys.argv[ 2 ]

# initialize
library   = configuration( LIBRARY )
# NOTE(review): rdr seems to overload the division operator to build the
# file-system path to the carrel's sentences file -- confirm against rdr docs
sentences = library/carrel/ETC/SENTENCES

# get and process each sentence; create a set of matching results
results = [] 
for sentence in Sentences( sentences ) : 
	
	# normalize; strip trailing whitespace (including the newline)
	sentence = sentence.rstrip()
	
	# filter; a case-insensitive substring match, so "art" also matches "part"
	if word.lower() in sentence.lower() :
					
		# disambiguate; the magic happens here
		synset = lesk( word_tokenize( sentence ), word )
		
		# update, conditionally; lesk may return None when no synset fits
		if synset : results.append( ( synset, sentence ) )
	
# count &#38; tabulate the results
synsets = {}
for result in results :

	# parse
	synset = result[ 0 ]
	
	# count and tabulate
	if synset in synsets : synsets[ synset ] += 1
	else                 : synsets[ synset ] =  1

# sort and process each resulting synset; tricky; most frequent synsets first
synsets = dict( sorted( synsets.items(), key=lambda x:x[ 1 ], reverse=True ) )
for synset in synsets.keys() :
	
	# get the list of matching sentences; pythonic
	# NOTE(review): identity ("is") comparison relies on NLTK returning cached
	# synset objects; an equality ("==") test would be more robust -- confirm
	sentences = [ result[ 1 ] for result in results if result[ 0 ] is synset ]
		
	# output; note that all results are written to stderr, not stdout
	sys.stderr.write( '      synset: ' + synset.name()            + '\n' )	
	sys.stderr.write( '   frequency: ' + str( synsets[ synset ] ) + '\n' )	
	sys.stderr.write( '  definition: ' + synset.definition()      + '\n' )
	
	# output some more; matching sentences are only shown when VERBOSE is True
	if VERBOSE :
		for sentence in sentences :
			sys.stderr.write( '    sentence: ' + sentence     + '\n' )
	
	# delimit
	sys.stderr.write( '\n' )

# done
exit()</p>


</div1>

    </body>
    <back>
    </back>
  </text>
</TEI.2>
