"""
This module provides code for doing logistic regressions.


Classes:
LogisticRegression    Holds information for a LogisticRegression classifier.


Functions:
train        Train a new classifier.
calculate    Calculate the probabilities of each class, given an observation.
classify     Classify an observation into a class.
"""


# Use the built-in set type when available; fall back to the old
# sets module on ancient Pythons.
try:
    set = set
except NameError:
    from sets import Set as set


import numpy
import numpy.linalg


class LogisticRegression(object):
    """Holds information necessary to do logistic regression
    classification.

    Members:
    beta    List of the weights for each dimension.

    """
    def __init__(self):
        """LogisticRegression()"""
        self.beta = []

def train(xs, ys, update_fn=None, typecode=None):
    """train(xs, ys[, update_fn]) -> LogisticRegression

    Train a logistic regression classifier on a training set. xs is a
    list of observations and ys is a list of the class assignments,
    which should be 0 or 1. xs and ys should contain the same number
    of elements. update_fn is an optional callback function that
    takes as parameters the iteration number and log likelihood.

    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != set([0, 1]):
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = 'd'

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant (intercept) dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first column for the intercept.
    X = numpy.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = numpy.transpose(X)
    y = numpy.asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = numpy.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0

    # Iterate with Newton-Raphson-style updates until the log
    # likelihood converges.
    iter = 0
    old_beta = old_llik = None
    while iter < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1 + e^(beta X))
        ebetaX = numpy.exp(numpy.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score.
        logp = y * numpy.log(p) + (1 - y) * numpy.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(iter, llik)

        # If the likelihood decreased, take a smaller step size and
        # back up to the previous beta.
        if old_llik is not None and llik < old_llik:
            stepsize = stepsize / 2.0
            beta = old_beta
        # If the likelihood has converged, stop.
        if old_llik is not None and numpy.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
            break
        old_llik, old_beta = llik, beta
        iter += 1

        # Compute the update, solving (Xt W X) delta = Xt (y - p),
        # where W is a diagonal matrix with p on the diagonal.
        W = numpy.identity(N) * p
        Xtyp = numpy.dot(Xt, y - p)              # First derivative.
        XtWX = numpy.dot(numpy.dot(Xt, W), X)    # Second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if numpy.fabs(stepsize - 1.0) > 0.001:
            delta = delta * stepsize
        beta = beta + delta                      # Update beta.
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = [float(b) for b in beta]   # Convert back to a plain Python list.
    return lr
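
# A minimal usage sketch for train(); the observations and labels below are
# made-up illustrative values, not data shipped with this module:
#
#     def show_progress(iteration, loglik):
#         print("Iteration %d: log likelihood %f" % (iteration, loglik))
#
#     xs = [[0.5], [1.5], [1.0], [2.0]]   # four one-dimensional observations
#     ys = [0, 0, 1, 1]                   # their 0/1 class labels
#     model = train(xs, ys, update_fn=show_progress)
#     model.beta                          # [intercept, weight] after convergence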


def calculate(lr, x):
    """calculate(lr, x) -> list of probabilities

    Calculate the probability for each class. lr is a
    LogisticRegression object. x is the observed data. Returns a
    list of the probability that it fits each class.

    """
    # Insert a constant term for the intercept.
    x = numpy.asarray([1.0] + x)
    # Calculate the probability.  p = e^(beta X) / (1 + e^(beta X))
    ebetaX = numpy.exp(numpy.dot(lr.beta, x))
    p = ebetaX / (1 + ebetaX)
    return [1 - p, p]
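
# A brief sketch of calculate(); `model` is assumed to be a LogisticRegression
# returned by train() above, and [1.2] is a made-up observation:
#
#     p0, p1 = calculate(model, [1.2])
#     # p0 and p1 are the class 0 and class 1 probabilities and sum to 1.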


def classify(lr, x):
    """classify(lr, x) -> 1 or 0

    Classify an observation into a class.

    """
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
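

if __name__ == "__main__":
    # Illustrative end-to-end demo only; the observations and labels are
    # made-up values chosen so the two classes overlap and train() converges.
    xs = [[0.5], [1.5], [1.0], [2.0]]
    ys = [0, 0, 1, 1]
    model = train(xs, ys)
    print("beta:", model.beta)
    print("probabilities for [1.2]:", calculate(model, [1.2]))
    print("predicted class for [1.2]:", classify(model, [1.2]))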