From: Neil Smith Date: Fri, 30 Sep 2011 15:14:37 +0000 (+0100) Subject: Done the nonlinear classifer X-Git-Url: https://git.njae.me.uk/?p=pci.git;a=commitdiff_plain Done the nonlinear classifer --- diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..5c6621e --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,25 @@ +GEM + remote: http://rubygems.org/ + specs: + diff-lcs (1.1.3) + gnuplot (2.3.6) + rake (0.9.2) + rdoc (3.9.4) + rspec (2.6.0) + rspec-core (~> 2.6.0) + rspec-expectations (~> 2.6.0) + rspec-mocks (~> 2.6.0) + rspec-core (2.6.4) + rspec-expectations (2.6.0) + diff-lcs (~> 1.1.2) + rspec-mocks (2.6.0) + +PLATFORMS + ruby + +DEPENDENCIES + bundler (~> 1.0.0) + gnuplot + rake + rdoc + rspec (~> 2.6.0) diff --git a/lib/svm/svm.rb b/lib/svm/svm.rb index e4a4d7f..a681d9f 100644 --- a/lib/svm/svm.rb +++ b/lib/svm/svm.rb @@ -6,6 +6,7 @@ class ClassifiedData def initialize(data = Array.new, classification = nil) @data = data @classification = classification + self end end @@ -18,6 +19,15 @@ def load_data(filename, only_numbers = false) rows end +def matches_to_numeric(rows) + rows.map do |row| + d = row.data + ClassifiedData.new([d[0].to_f, yes_no(d[1]), yes_no(d[2]), + d[5].to_f, yes_no(d[6]), yes_no(d[7]), + match_count(d[3], d[8])], + row.classification) + end +end def plot_age_matches(rows) Gnuplot.open do |gp| @@ -77,7 +87,7 @@ def dot_product_classify(point, averages) end end -def yesno(v) +def yes_no(v) if v == 'yes' then 1 elsif v == 'no' then -1 else 0 @@ -92,4 +102,71 @@ def miles_distance(a1, a2) 0 end +def scale_data_set(rows) + # Could be many rows, so still make one pass through the data rather than + # using Array#max and #min for each data field + lows = Array.new(rows[0].data.length, 999999999.0) + highs = Array.new(rows[0].data.length, -999999999.0) + rows.each do |row| + data = row.data + (data.length).times do |i| + lows[i] = data[i] if data[i] < lows[i] + highs[i] = data[i] if data[i] > highs[i] + end + end + + scale_data = Proc.new do |row| + row.zip(lows, highs).map {|d| (d[0] - d[1]) / (d[2] - d[1]) } + end + + new_rows = rows.map do |row| + ClassifiedData.new(scale_data.call(row.data), row.classification) + end + + return new_rows, scale_data +end + +# Usage: +# numeric_matches = matches_to_numeric matches +# scaled_set, scale_f = scale_data_set numeric_matches +# averages = linear_train scaled_set +# dot_product_classify(scale_f.call(numeric_matches[11].data), averages) + +def radial_basis(v1, v2, gamma = 20) + len = Math.sqrt((v1.zip v2).map {|c| (c[0] - c[1]) ** 2 }.reduce(:+)) + Math.exp(-gamma * len) +end + +def nonlinear_classify(point, rows, offset, gamma = 10) + match_sum = no_match_sum = 0.0 + match_count = no_match_count = 0 + rows.each do |row| + if row.classification == 1 + match_sum += radial_basis(point, row.data, gamma) + match_count += 1 + else + no_match_sum += radial_basis(point, row.data, gamma) + no_match_count += 1 + end + end + y = match_sum / match_count - no_match_sum / no_match_count + offset + if y < 0 + 0 + else + 1 + end +end +def nonlinear_offset(rows, gamma = 10) + matches = [] ; no_matches = [] + rows.each do |r| + if r.classification == 1 + matches << r.data + else + no_matches << r.data + end + end + sum_matches = matches.map {|v1| matches.map {|v2| radial_basis(v1, v2, gamma)}.reduce(:+)}.reduce(:+) + sum_no_matches = no_matches.map {|v1| no_matches.map {|v2| radial_basis(v1, v2, gamma)}.reduce(:+)}.reduce(:+) + sum_matches / matches.length ** 2 - sum_no_matches / no_matches.length ** 2 +end