#!/usr/bin/ruby -w
# frozen_string_literal: true

# Trains a dspam user from a corpus directory containing a "nonspam" and a
# "spam" sub-directory. Messages are fed to dspam alternately (one nonspam,
# one spam); every misclassified message is reported back to dspam as an
# error and re-submitted until dspam classifies it correctly or the retry
# limit is reached.
#
# Usage: <script> [--trainingmode MODE | -t MODE] USER CORPUS_DIR

require 'getoptlong'
require 'shellwords'

$PATH = "/usr/local/bin" # Path to dspam binaries
$TRAINING_MODE = "teft"  # Training mode
# formail command line used as a pre-filter; the -I options name the
# DSPAM/Spamprobe headers so that headers left by an earlier run do not reach
# dspam (see formail's documentation for -I).
$FORMAIL = "formail -I X-DSPAM-Result -I X-DSPAM-Confidence -I X-DSPAM-Probability -I X-DSPAM-Signature -I X-DSPAM-Factors -I X-Spamprobe"

# One training run of a single dspam user against a corpus directory.
class TrainingSession
  # Maps the class name handed to dspam (--class=...) to the pattern that the
  # X-DSPAM-Result header must match for a message of that class to count as
  # correctly classified.
  RESULT_PATTERNS = {
    "innocent" => /Innocent|Whitelisted/,
    "spam" => /Spam/
  }.freeze

  # Number of nonspam/spam pairs processed between two progress reports.
  attr_accessor :reportingWindow

  # user         - dspam user name passed as --user
  # trainingMode - dspam training mode passed as --mode
  # corpusDir    - directory holding the "nonspam" and "spam" sub-directories
  def initialize(user, trainingMode, corpusDir)
    @user = user
    @trainingMode = trainingMode
    @corpusDir = corpusDir
    @reportingWindow = 250
    @maxRetrainTries = 3
    reset_counters
  end

  # Processes one nonspam and one spam message per iteration until either
  # pool is exhausted; surplus messages of the larger pool are not processed.
  # Every @reportingWindow pairs the counters are printed and reset.
  # NOTE: counts accumulated after the last full window are never printed.
  def run
    innocentDir = "#@corpusDir/nonspam"
    spamDir = "#@corpusDir/spam"

    # Block form so the directory handles are closed as soon as they are read.
    innocentMsgs = Dir.open(innocentDir) { |dir| getDirFiles(dir) }
    spamMsgs = Dir.open(spamDir) { |dir| getDirFiles(dir) }

    puts "nb innocentMsgs : #{innocentMsgs.size}"
    puts "nb spamMsgs : #{spamMsgs.size}"

    msgCount = 0
    until innocentMsgs.empty? || spamMsgs.empty?
      process_innocent "#{innocentDir}/#{innocentMsgs.pop}"
      process_spam "#{spamDir}/#{spamMsgs.pop}"
      msgCount += 1
      report_and_reset if (msgCount % @reportingWindow).zero?
    end
  end

  # Returns the names (not paths) of the regular files directly inside +dir+,
  # an object responding to #entries and #path such as a Dir.
  def getDirFiles(dir)
    dir.entries.select { |name| File.file?(File.join(dir.path, name)) }
  end

  # Pipes +msgFile+ through formail and dspam.
  #
  # Returns [lines, result]: the lines dspam wrote to stdout and the stripped
  # X-DSPAM-Result header line.
  # Raises RuntimeError when the output contains no X-DSPAM-Result header.
  def process_msg(msgFile)
    cmd = "#{$FORMAIL} < #{Shellwords.escape(msgFile.to_s)} | #{dspam_command}"
    # Binary mode: mail is not guaranteed to be valid UTF-8, and matching a
    # regexp against an invalidly encoded text string raises.
    lines = IO.popen(cmd, "rb", &:readlines)
    [lines, dspam_result(lines, msgFile)]
  end

  # Reports +msg+ (the lines of a message previously returned by dspam) back
  # to dspam as a classification error whose correct class is +msgClass+.
  # dspam's output is read and discarded. Returns nil.
  def process_error(msg, msgClass)
    cmd = dspam_command("--class=#{Shellwords.escape(msgClass.to_s)}", "--source=error")
    IO.popen(cmd, "r+b") do |io|
      msg.each { |line| io.print line }
      io.close_write # signal end of input before draining the output
      io.read
    end
    nil
  end

  # Repeats "report the error, then re-submit +msgFile+" until dspam puts the
  # message in +msgClass+ or @maxRetrainTries attempts have been made.
  # Returns true on success, false otherwise.
  def retrain_error(msgFile, processedMsg, msgClass)
    puts "Retraining #{msgFile} for class #{msgClass}"
    @maxRetrainTries.times do
      process_error processedMsg, msgClass
      _retrained, result = process_msg(msgFile)
      puts "retraining result : #{result}"
      if classified_as?(result, msgClass)
        puts "Retraining succeeded"
        return true
      end
    end
    puts "Retraining failed"
    false
  end

  # Submits a message known to be nonspam; counts a false positive and
  # retrains when dspam does not report it as Innocent or Whitelisted.
  def process_innocent(msgFile)
    msg, result = process_msg(msgFile)
    if classified_as?(result, "innocent")
      @innocentCount += 1
    else
      puts "process_innocent : retraining #{msgFile}"
      @falsePositiveCount += 1
      retrain_error msgFile, msg, "innocent"
    end
  end

  # Submits a message known to be spam; counts a miss and retrains when
  # dspam does not report it as Spam.
  def process_spam(msgFile)
    msg, result = process_msg(msgFile)
    if classified_as?(result, "spam")
      @spamCount += 1
    else
      puts "process_spam : retraining #{msgFile}"
      @spamMissedCount += 1
      retrain_error msgFile, msg, "spam"
    end
  end

  private

  # Builds the dspam command line shared by classification and error
  # reporting; +extra+ arguments are inserted before --deliver=stdout and must
  # already be shell-safe.
  def dspam_command(*extra)
    args = ["--user #{Shellwords.escape(@user.to_s)}",
            "--mode=#{Shellwords.escape(@trainingMode.to_s)}"]
    args.concat(extra)
    args << "--deliver=stdout"
    "#{$PATH}/dspam #{args.join(' ')}"
  end

  # Returns the first X-DSPAM-Result line of +lines+, stripped.
  # Raises RuntimeError when there is none, instead of failing on nil.
  def dspam_result(lines, msgFile)
    header = lines.find { |line| line =~ /X-DSPAM-Result:/ }
    raise "dspam returned no X-DSPAM-Result header for #{msgFile}" if header.nil?

    header.strip
  end

  # True when +result+ matches the pattern registered for +msgClass+; a class
  # without a registered pattern never matches.
  def classified_as?(result, msgClass)
    pattern = RESULT_PATTERNS[msgClass]
    !pattern.nil? && !(result =~ pattern).nil?
  end

  # Prints the counters of the current reporting window, then clears them.
  def report_and_reset
    puts "Spam Correct : #@spamCount"
    puts "Spam Missed : #@spamMissedCount"
    puts "Nonspam Correct: #@innocentCount"
    puts "Nonspam Missed : #@falsePositiveCount"
    puts "--------------------"
    reset_counters
  end

  def reset_counters
    @innocentCount = 0
    @spamCount = 0
    @falsePositiveCount = 0
    @spamMissedCount = 0
  end
end

### And now for something completely different...

# Consumes the command-line options from ARGV and returns the training mode:
# the argument of --trainingmode / -t when given, +default_mode+ otherwise.
# GetoptLong leaves the non-option arguments in ARGV.
def parse_training_mode(default_mode)
  mode = default_mode
  opts = GetoptLong.new(
    ["--trainingmode", "-t", GetoptLong::REQUIRED_ARGUMENT]
  )
  opts.each do |opt, arg|
    case opt
    when "--trainingmode"
      mode = arg
    else
      raise "Unrecognized option #{opt}"
    end
  end
  mode
end

if __FILE__ == $PROGRAM_NAME
  trainingMode = parse_training_mode($TRAINING_MODE)
  user, corpusDir = ARGV

  if user && corpusDir
    ts = TrainingSession.new(user, trainingMode, corpusDir)
    ts.reportingWindow = 50
    ts.run
  else
    warn "Usage: #{File.basename($PROGRAM_NAME)} [--trainingmode MODE] USER CORPUS_DIR"
    # No positional argument at all is treated as a request for help; a
    # partial command line is an error.
    exit 1 unless user.nil?
  end
end