4 attr_accessor
:data, :classification
6 def initialize(data = Array
.new
, classification
= nil)
8 @classification = classification
13 def load_data(filename
, only_numbers
= false)
15 IO
.foreach(filename
) do |line
|
16 rows
<< ClassifiedData
.new(line
.chomp
.split(',')[0..-2].map
{|field
| only_numbers
? field
.to_f
: field
},
17 line
.chomp
.split(',')[-1].to_i
)
22 def matches_to_numeric(rows
)
25 ClassifiedData
.new([d
[0].to_f
, yes_no(d
[1]), yes_no(d
[2]),
26 d
[5].to_f
, yes_no(d
[6]), yes_no(d
[7]),
27 match_count(d
[3], d
[8])],
32 def plot_age_matches(rows
)
34 Gnuplot
::Plot.new( gp
) do |plot
|
36 plot
.title
"Ages of matches"
40 matches
= rows
.select
{|r
| r
.classification
== 1}.map
{|r
| r
.data}
41 non_matches
= rows
.select
{|r
| r
.classification
== 0}.map
{|r
| r
.data}
43 plot
.data = [ Gnuplot
::DataSet.new( [matches
] ) do |ds
|
46 end , Gnuplot
::DataSet.new( [non_matches
] ) do |ds
|
55 def linear_train(rows
)
61 row_class
= row
.classification
62 sums
[row_class
] ||= [0.0] * (row
.data.length
)
64 (row
.data.length
).times
do |i
|
65 sums
[row_class
][i
] += row
.data[i
]
67 counts
[row_class
] += 1
70 sums
.keys
.each
do |match_class
|
71 averages
[match_class
] = sums
[match_class
].map
{|sum
| sum
/ counts
[match_class
]}
# Computes the dot product of two numeric vectors.
#
# Elements are paired positionally with Array#zip, so +v2+ is expected to be
# at least as long as +v1+; any extra trailing elements of +v2+ are ignored.
#
# @param v1 [Array<Numeric>] first vector
# @param v2 [Array<Numeric>] second vector
# @return [Numeric] sum of the element-wise products; 0 when +v1+ is empty
def dot_product(v1, v2)
  # Seed the fold with 0 so that empty vectors produce 0 rather than the nil
  # an unseeded reduce(:+) would return.
  v1.zip(v2).reduce(0) { |total, (a, b)| total + a * b }
end
80 def dot_product_classify(point
, averages
)
81 b
= (dot_product(averages
[1], averages
[1]) - dot_product(averages
[0], averages
[0])) / 2
82 y
= dot_product(point
, averages
[0]) - dot_product(point
, averages
[1]) + b
92 elsif v
== 'no' then -1
# Counts the interests that two colon-separated interest lists share.
#
# Both strings are split on ':' and intersected with Array#&, so an interest
# repeated within one list is only counted once.
#
# @param interests1 [String] colon-separated interests, e.g. "skiing:knitting"
# @param interests2 [String] colon-separated interests
# @return [Integer] number of distinct interests present in both lists
def match_count(interests1, interests2)
  (interests1.split(':') & interests2.split(':')).length
end
101 def miles_distance(a1
, a2
)
105 def scale_data_set(rows
)
# There could be many rows, so make a single pass through the data rather than
# calling Array#max and Array#min once per data field.
108 lows
= Array
.new(rows
[0].data.length
, 999999999.0)
109 highs
= Array
.new(rows
[0].data.length
, -999999999.0)
112 (data.length
).times
do |i
|
113 lows
[i
] = data[i
] if data[i
] < lows
[i
]
114 highs
[i
] = data[i
] if data[i
] > highs
[i
]
118 scale_data
= Proc
.new
do |row
|
119 row
.zip(lows
, highs
).map
{|d
| (d
[0] - d
[1]) / (d
[2] - d
[1]) }
122 new_rows
= rows
.map
do |row
|
123 ClassifiedData
.new(scale_data
.call(row
.data), row
.classification
)
126 return new_rows
, scale_data
# numeric_matches = matches_to_numeric matches
# scaled_set, scale_f = scale_data_set numeric_matches
# averages = linear_train scaled_set
# dot_product_classify(scale_f.call(numeric_matches[11].data), averages)
# Radial basis kernel: maps the distance between two vectors to a similarity
# score that is 1.0 for identical vectors and decays towards 0 as they move
# apart.
#
# Note that the exponent uses the Euclidean distance itself (the square root
# of the summed squared differences), not the squared distance.
#
# @param v1 [Array<Numeric>] first vector
# @param v2 [Array<Numeric>] second vector, paired positionally with +v1+
# @param gamma [Numeric] decay rate; larger values make the score fall off
#   faster with distance
# @return [Float] Math.exp(-gamma * distance); 1.0 when the vectors are empty
def radial_basis(v1, v2, gamma = 20)
  # Seed the fold with 0 so empty vectors give distance 0 instead of passing
  # nil to Math.sqrt.
  squared_distance = v1.zip(v2).reduce(0) { |total, (a, b)| total + (a - b)**2 }
  Math.exp(-gamma * Math.sqrt(squared_distance))
end
140 def nonlinear_classify(point
, rows
, offset
, gamma
= 10)
141 match_sum
= no_match_sum
= 0.0
142 match_count
= no_match_count
= 0
144 if row
.classification
== 1
145 match_sum
+= radial_basis(point
, row
.data, gamma
)
148 no_match_sum
+= radial_basis(point
, row
.data, gamma
)
152 y
= match_sum
/ match_count - no_match_sum / no_match_count
+ offset
160 def nonlinear_offset(rows
, gamma
= 10)
161 matches
= [] ; no_matches
= []
163 if r
.classification
== 1
169 sum_matches
= matches
.map
{|v1
| matches
.map
{|v2
| radial_basis(v1
, v2
, gamma
)}.reduce(:+)}.reduce(:+)
170 sum_no_matches
= no_matches
.map
{|v1
| no_matches
.map
{|v2
| radial_basis(v1
, v2
, gamma
)}.reduce(:+)}.reduce(:+)
171 sum_matches
/ matches.length ** 2 - sum_no_matches / no_matches
.length
** 2