# -*- coding: iso-8859-1 -*-
"""
SentenceSplitter.py for splitting chunks of text into orthographic sentences.

Contains a class SentenceSplitter which is used to split paragraphs into
sentences using a simple punctuation mark detecting regular expression
and a list of abbreviations that should not trigger a split.

Improves on the original code by Mickel Grönroos by adding methods for
retrieving and changing the regular expression that does the sentence
splitting, which can be useful if you want to change the delimiters of
the sentence.

I've also changed the default abbreviation list so that single capital
letters --- as in "John B. Smith" --- will be ignored. The problem
associated to this strategy is that sentences ending in such characters
won't be detected either, i.e.:

    "'It was me," said John B. But the truth was other."

There is no workaround for this problem: you must choose between one of
these two methods (or teach your computer semantics). Methods are
provided to enable or disable 'single caps-detection'.

Copyright (C) 2007 Denis Fernandez Cabrera
Copyright (C) 2004 Mickel Grönroos

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

Contact information:

For problems with this version of the module, please contact me
(Denis Fernandez Cabrera) at:

    denis@ceibes.org

And the source for this module can be found at:

    http://ceibes.org/gnd/sentence-splitter#attachments

The source for the original module can be found at:

    http://www.pojkfilmsklubben.org/mickel/code/python/SentenceSplitter.py

And the original author, for questions pertaining to his code, can be
contacted at:

    Mickel.Gronroos@csc.fi

    Mickel Grönroos
    The Finnish IT center for science CSC
    P.O. Box 405
    FIN-02101 ESPOO
    Finland

Version: $Id: SentenceSplitter.py,v 1.6 2005/03/15 13:24:17 denis Exp $

Known limitations:

- Cannot correctly handle citations (in citation marks) that contain
  multiple sentences, e.g. "This is a citation. It has two sentences."
- Returns empty strings for empty lines. This could either be considered
  a bug or a feature. ;o)
- Has problems with the localization, which are probably in turn caused
  by my (Denis) lack of knowledge of the way Python works with codecs.
  This is a TO-DO item.
"""

import re
import locale
import string
import os

__version__ = "1.0"

## Default module variables
if os.name == "posix":
    LOCALE = "es_ES"
else:
    LOCALE = "es"

ESCAPE = [(".", "_PERIOD_"), (":", "_COLON_"), (";", "_SEMICOLON_")]

# Upper-case letters used to build the default pattern and the list of
# single capitals.  Python 2 exposes string.uppercase; Python 3 only has
# string.ascii_uppercase, so fall back to it when the former is missing.
_UPPERCASE = getattr(string, "uppercase", string.ascii_uppercase)

## The default regular expression for splitting: a run of closing
## punctuation, one space, then the start of the next sentence (optional
## opening punctuation followed by an upper-case letter or a digit).
REGEXP = (r"""([\!"#'\(\)\.\?]+) (["'\(\-]*\s?["""
          + _UPPERCASE + string.digits + "])")
# (With semicolon added as a delimiter, the first group would instead be
#  r"""([\!"#'\(\)\.\?;]+)""".)

# Default list of abbreviations.
ABBREVIATIONS = []

# The list of single capital letters followed by a period.
CAPITALS = [letter + "." for letter in _UPPERCASE]


class SentenceSplitter(object):
    """Split text into sentences on punctuation, honouring a stop list.

    The splitter temporarily "escapes" the punctuation of every known
    abbreviation, inserts a newline at each sentence boundary matched by
    the regular expression, restores the escaped punctuation and finally
    splits the result on newlines.
    """

    def __init__(self, loc = LOCALE, abbreviations = ABBREVIATIONS,
                 escape = ESCAPE, regexp = REGEXP):
        """Construct a SentenceSplitter object.

        Parameters:

        1. loc (a string or tuple in the format accepted by
           locale.setlocale(); default: the module-level LOCALE).
           NOTE: the locale is NOT applied by the constructor -- the
           call was disabled in the original code -- so call
           setLocale() explicitly if it is needed.
        2. abbreviations (a "stop list" of abbreviations that should
           not be split, including punctuation. Single capital letters
           are added to this list automatically; default: the
           module-level ABBREVIATIONS)
        3. escape (a sequence of tuples to escape punctuation in the
           stop list; default: the module-level ESCAPE)
        4. regexp (a string with the regular expression to do the
           splitting; default: the module-level REGEXP)
        """
        ## Set prerequisites
        # Build a new list so that neither the caller's sequence nor the
        # shared module-level default is ever modified by this object.
        self.setAbbreviations(list(abbreviations) + CAPITALS)
        self.setEscape(escape)
        self.setRegexp(regexp)

    def setLocale(self, loc):
        """Sets the locale. Parameter must be in the format accepted by
        locale.setlocale()."""
        locale.setlocale(locale.LC_ALL, loc)

    def getLocale(self):
        """Returns the current locale as reported by locale.getlocale()."""
        return locale.getlocale()

    def setAbbreviations(self, abbreviations):
        """Sets the abbreviation "stop list", i.e. a list of
        abbreviations that should not trigger a split."""
        self._abbreviations = abbreviations

    def getAbbreviations(self):
        """Returns the "stop list" of abbreviations (an empty list if
        none has been set yet)."""
        try:
            return self._abbreviations
        except AttributeError:
            return []

    def setEscape(self, escape):
        """Sets the escape handling, i.e. how the punctuation characters
        in the abbreviation stop list should be escaped before splitting
        and turned back to after splitting.

        The parameter should be a sequence of tuples, for example:
        escape=[(".", "_PERIOD_"), (":", "_COLON_")]
        """
        self._escape = escape

    def getEscape(self):
        """Returns the current sequence of tuples used for escaping
        punctuation in the abbreviations in the stop list (an empty list
        if none has been set yet)."""
        try:
            return self._escape
        except AttributeError:
            return []

    def setRegexp(self, regexp):
        r"""Sets the regular expression string used for the splitting.

        The expression must define two groups: the text that ends a
        sentence and the text that starts the next one; a newline is
        inserted between them.

        Keep in mind that, unless a raw string is used, the backslash
        characters must be escaped, i.e. the pattern ([\.\?]+) should be
        given as '([\\.\\?]+)'.

        Raises re.error if the expression cannot be compiled.
        """
        # The regular expression matching sentence boundaries
        self._regexpstring = regexp
        # A pattern to go with the regular expression for splitting
        # a chunk of text into sentences
        self._replacepattern = r"\1\n\2"
        # Compile the regular expression object
        self._regexpobject = re.compile(self._regexpstring)

    def getRegexp(self):
        r"""Returns the regular expression string that is used for
        sentence splitting (an empty string if none has been set yet).

        Keep in mind that the interactive interpreter displays the
        backslash characters escaped, i.e. a returned value shown as
        '([\\.\\?]+)' really is the pattern ([\.\?]+).
        """
        try:
            return self._regexpstring
        except AttributeError:
            return ""

    def getCapState(self):
        """Returns True if 'single-capital-with-period' detection is
        active, i.e. if every entry of CAPITALS is in the stop list,
        False otherwise."""
        abbreviations = self.getAbbreviations()
        # Every capital has to be present: switchCapState() removes all
        # of them and would fail on a partially populated list.
        return all(letter in abbreviations for letter in CAPITALS)

    def switchCapState(self):
        """Switches the splitter between breaking sentences at single
        capitals with a period, such as

            'It was done by John B. Although nobody knew.'

        and not breaking them.
        """
        abbreviations = self.getAbbreviations()
        if self.getCapState():
            # Work on a copy so the list handed in by the caller through
            # setAbbreviations() is left untouched.
            remaining = list(abbreviations)
            for letter in CAPITALS:
                remaining.remove(letter)
            self.setAbbreviations(remaining)
        else:
            self.setAbbreviations(abbreviations + CAPITALS)

    def split(self, text):
        """Splits a chunk of text into a list of sentences.

        Returns a list of strings; text without any sentence boundary
        (including the empty string) yields a one-element list.
        """
        escapes = self.getEscape()

        ## First "escape" all abbreviations found in the text
        for abbrev in self.getAbbreviations():
            if abbrev not in text:
                continue
            # Apply every mapping to the same abbreviation before touching
            # the text, so that abbreviations containing several kinds of
            # punctuation are escaped completely.
            escaped = abbrev
            for mapping in escapes:
                escaped = escaped.replace(mapping[0], mapping[1])
            text = text.replace(abbrev, escaped)

        ## Then insert a newline at every sentence boundary
        sentencestring = self._regexpobject.sub(self._replacepattern, text)

        ## Now "unescape" the abbreviations
        for mapping in escapes:
            sentencestring = sentencestring.replace(mapping[1], mapping[0])

        ## Split sentencestring on newlines and return the list
        return sentencestring.split("\n")


## Self-test code
if __name__ == '__main__':
    # raw_input() only exists on Python 2; input() is its Python 3 name.
    try:
        read_input = raw_input
    except NameError:
        read_input = input

    ## A (way too) small collection of abbreviations:
    abb = ['Mr.', 'Jr.', 'Dr.', 'Ms.', 'Mrs.', 'Ph.D.']

    ## Create the SentenceSplitter object
    ss = SentenceSplitter(abbreviations = abb)

    ## Ask the user for a file containing test text
    filename = read_input('File: ')

    ## Read the whole file into one text chunk (default encoding); the
    ## handle is closed as soon as the content has been read.
    with open(filename) as fh:
        textchunk = fh.read()

    print("Split file:")
    for line in ss.split(textchunk):
        print(">>> " + line)