Source code for detect_spam

#!/usr/bin/python2.7
# Copyright 2011 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Handler for spam note detection: stores the bad word list and provides
   utilities to evaluate the quality of notes."""

__author__ = 'shaomei@google.com (Shaomei Wu)'

import unicodedata
import logging
import re
import jautils

def normalize(string):
    """Normalize a string to all lowercase and remove accents.

    The input is coerced to text (None and other falsy values become the
    empty string), stripped of surrounding whitespace, lowercased and
    decomposed to Unicode normal form D. The combining marks produced by the
    decomposition are then dropped, so an accented letter is reduced to its
    plain base letter.

    Args:
        string: the value to normalize; may be None.

    Returns:
        The normalized text string.
    """
    try:
        text_type = unicode  # Python 2 text type.
    except NameError:
        text_type = str  # No unicode builtin: str is already Unicode text.
    string = text_type(string or '').strip().lower()
    # Normal form D (NFD) is the canonical decomposition: each accented
    # character becomes its base character followed by combining marks.
    decomposed = unicodedata.normalize('NFD', string)
    # NFD on its own keeps the combining marks, so filter them out here to
    # actually remove the accents.
    return u''.join(ch for ch in decomposed if not unicodedata.combining(ch))
class SpamDetector(object):
    """Scores text by the fraction of its words that are known bad words.

    Attributes:
        bad_words_set: set of normalized bad words used for matching.
    """

    def __init__(self, bad_words):
        """Build the detector from a comma-separated list of bad words.

        Args:
            bad_words: string of bad words separated by commas (each comma
                may be followed by whitespace). None or an empty string
                yields a detector with no bad words.
        """
        # Keep the set on the instance: a class-level set would be shared by
        # every detector, so words given to one would leak into all others.
        self.bad_words_set = set()
        if not bad_words:
            return
        # Input bad words are separated by commas.
        for word in re.split(r',\s*', bad_words):
            # Normalize the bad word the same way the scored text is
            # normalized, so both sides compare equal.
            normalized_word = normalize(word)
            # Skip blanks left by stray commas, e.g. 'foo,,bar' or 'foo,'.
            if normalized_word:
                self.bad_words_set.add(normalized_word)

    def estimate_spam_score(self, text):
        """Estimate the probability of the input text being spam.

        The score is the fraction of the words in the normalized text that
        appear in the bad word set.

        Args:
            text: the text to score; may be None.

        Returns:
            a float score between [0,1], or None if text is empty after
            normalization.
        """
        normalized_text = normalize(text)

        # Tokenize the text into words. Currently we keep hyphens and
        # apostrophes in the words but filter all the other punctuation marks.
        # TODO(shaomei): better ways to tokenize CJK text.
        # Splitting out each CJK ideograph as its own word probably
        # is not the best way of tokenization. We can do bigrams in
        # the future.
        words = re.findall(r"\w+-\w+|[\w']+", normalized_text)

        if not words:
            logging.debug('input text contains no words.')
            return None

        # Count every occurrence of a bad word, not just the distinct ones:
        # the denominator counts every token, so the numerator must as well,
        # otherwise repeating a bad word lowers the score.
        bad_word_count = sum(1 for word in words if word in self.bad_words_set)
        # Simple way to calculate spam score for now.
        return float(bad_word_count) / len(words)