Source code for detect_spam
#!/usr/bin/python2.7
# Copyright 2011 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Handler for spam note detection, store bad word list and provide
utilities to evaluate the quality of notes."""
__author__ = 'shaomei@google.com (Shaomei Wu)'
import unicodedata
import logging
import re
import jautils
def normalize(string):
    """Normalize a string to all lowercase and remove accents.

    The input is coerced to a text string (None and other falsy values
    become the empty string), stripped of surrounding whitespace and
    lowercased.  It is then decomposed to Unicode normal form D and every
    combining character (accent mark) produced by the decomposition is
    dropped, so that e.g. u'Caf\\u00e9' becomes u'cafe'.

    Args:
        string: the value to normalize; may be None.

    Returns:
        The normalized text string.
    """
    # `unicode` only exists on Python 2; fall back to `str` elsewhere so the
    # helper does not depend on the interpreter version.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    string = text_type(string or '').strip().lower()
    # Normalize unicode to normal form D (NFD) - canonical decomposition.
    # This splits each accented character into its base character followed
    # by one or more combining marks.
    decomposed = unicodedata.normalize('NFD', string)
    # NFD alone keeps the combining marks, so drop them explicitly; otherwise
    # the accents are still present and accented words never compare equal
    # to their unaccented form.
    return u''.join(
        ch for ch in decomposed if not unicodedata.combining(ch))
class SpamDetector(object):
    """Scores text against a list of bad words.

    Each instance owns its own set of normalized bad words, built from the
    comma-separated string passed to the constructor.
    """

    # Immutable fallback so the attribute can still be read on the class
    # itself.  The working set is created per instance in __init__; a mutable
    # set here would be shared (and grown) by every detector ever created.
    bad_words_set = frozenset()

    def __init__(self, bad_words):
        """Build the bad word set.

        Args:
            bad_words: a string of bad words separated by commas (optional
                whitespace after each comma), or None / '' for no bad words.
        """
        self.bad_words_set = set()
        if not bad_words:
            return
        # Input bad words are separated by commas.
        for word in re.split(r',\s*', bad_words):
            # Normalize the bad word and add it to the set.
            normalized_word = normalize(word)
            # A leading, trailing or doubled comma yields an empty entry;
            # it can never match a token, so leave it out.
            if normalized_word:
                self.bad_words_set.add(normalized_word)

    def estimate_spam_score(self, text):
        """Estimate the probability of the input text being spam.

        The score is the number of distinct bad words that occur in the text
        divided by the total number of words in the text.

        Args:
            text: the text to evaluate; it is normalized before matching.

        Returns:
            a float score between [0,1], or None if text is empty
            after normalization.
        """
        # Normalize text
        normalized_text = normalize(text)
        # Tokenize the text into words. Currently we keep hyphen and
        # apostrophe in the words but filter all the other punctuation marks.
        # TODO(shaomei): better ways to tokenize CJK text.
        # Split out each CJK ideograph as its own word probably
        # is not the best way of tokenization. We can do bigram in
        # the future.
        words = re.findall(r"\w+-\w+|[\w']+", normalized_text)
        # Simple way to calculate spam score for now.
        if not words:
            logging.debug('input text contains no words.')
            return None
        # Look for bad words in the text by exact string match; the set
        # intersection counts each bad word once however often it repeats.
        bad_words_matched = self.bad_words_set.intersection(words)
        return float(len(bad_words_matched)) / float(len(words))