Code Coverage for /Users/cameron/Sites/statistical-classifier-pages/statistical-classifier/src/Camspiers/StatisticalClassifier/Classifier/SVM.php

	Code Coverage
	Classes and Traits			Functions and Methods				Lines
Total		0.00%	0 / 1		0.00%	0 / 7	CRAP		0.00%	0 / 178
SVM		0.00%	0 / 1		0.00%	0 / 7	1260		0.00%	0 / 178
__construct( DataSourceInterface $dataSource, SVMModel $model = null, Document\NormalizerInterface $documentNormalizer = null, TokenizerInterface $tokenizer = null, Token\NormalizerInterface $tokenNormalizer = null, \SVM $svm = null, $threshold = null )					0.00%	0 / 1	42		0.00%	0 / 27
prepareModel()					0.00%	0 / 1	240		0.00%	0 / 87
classify($document)					0.00%	0 / 1	12		0.00%	0 / 14
prepareDocument($document, SVMModel $model)					0.00%	0 / 1	30		0.00%	0 / 20
setThreshold($threshold)					0.00%	0 / 1	12		0.00%	0 / 16
getProbabilities($document)					0.00%	0 / 1	6		0.00%	0 / 10
hasThreshold()					0.00%	0 / 1	2		0.00%	0 / 4

1	<?php
2
3	/**
4	* This file is part of the Statistical Classifier package.
5	*
6	* (c) Cam Spiers <camspiers@gmail.com>
7	*
8	* For the full copyright and license information, please view the LICENSE
9	* file that was distributed with this source code.
10	*/
11
12	namespace Camspiers\StatisticalClassifier\Classifier;
13
14	use Camspiers\StatisticalClassifier\DataSource\DataSourceInterface;
15	use Camspiers\StatisticalClassifier\Model\SVMModel;
16	use Camspiers\StatisticalClassifier\Normalizer\Document;
17	use Camspiers\StatisticalClassifier\Normalizer\Token;
18	use Camspiers\StatisticalClassifier\Tokenizer\TokenizerInterface;
19	use Camspiers\StatisticalClassifier\Tokenizer\Word;
20	use Camspiers\StatisticalClassifier\Transform;
21
22	/**
23	* Provides a text based SVM classifier which uses libsvm
24	*
25	* @author Cam Spiers <camspiers@gmail.com>
26	* @package Statistical Classifier
27	*/
28	class SVM extends Classifier
29	{
/**
 * Tokenizer (the way of breaking up documents)
 * @var TokenizerInterface
 */
protected $tokenizer;
/**
 * Takes a document and makes it consistent before it is tokenized
 * @var Document\NormalizerInterface
 */
protected $documentNormalizer;
/**
 * Takes tokenized data and makes it consistent (e.g. stems it).
 * Optional: stays null when no token normalizer is supplied.
 * @var Token\NormalizerInterface|null
 */
protected $tokenNormalizer;
/**
 * libsvm wrapper used to configure and train the model.
 * Declared explicitly so that assigning it in the constructor does not
 * create a dynamic property.
 * @var \SVM
 */
protected $svm;
/**
 * Probability a classification must exceed to be accepted.
 * Null means no threshold is in use (see hasThreshold()).
 * @var float|null
 */
protected $threshold;
/**
 * Builds the classifier, substituting a default for most omitted collaborators.
 *
 * @param DataSourceInterface $dataSource Source of the training documents
 * @param SVMModel $model Model to populate; a new SVMModel when omitted
 * @param Document\NormalizerInterface $documentNormalizer Defaults to Document\Lowercase
 * @param TokenizerInterface $tokenizer Defaults to the Word tokenizer
 * @param Token\NormalizerInterface $tokenNormalizer Optional; stored as given (may be null)
 * @param \SVM $svm Defaults to an \SVM configured with a linear kernel
 * @param float|null $threshold Probability threshold; null or false leaves thresholding off
 * @throws \InvalidArgumentException when a threshold is given and setThreshold() rejects it
 */
public function __construct(
    DataSourceInterface $dataSource,
    SVMModel $model = null,
    Document\NormalizerInterface $documentNormalizer = null,
    TokenizerInterface $tokenizer = null,
    Token\NormalizerInterface $tokenNormalizer = null,
    \SVM $svm = null,
    $threshold = null
) {
    $this->dataSource = $dataSource;
    $this->model = $model ?: new SVMModel();
    $this->documentNormalizer = $documentNormalizer ?: new Document\Lowercase();
    $this->tokenizer = $tokenizer ?: new Word();
    $this->tokenNormalizer = $tokenNormalizer;

    if (!$svm) {
        $svm = new \SVM();
        $svm->setOptions(
            array(
                \SVM::OPT_KERNEL_TYPE => \SVM::KERNEL_LINEAR
            )
        );
    }
    $this->svm = $svm;

    // A plain truthiness test would silently discard a threshold of 0,
    // which hasThreshold() treats as a real value. Only null and false
    // mean "no threshold"; everything else is validated by setThreshold().
    if ($threshold !== null && $threshold !== false) {
        $this->setThreshold($threshold);
    }
}
/**
 * Trains the SVM from the data source and stores the result in the model.
 *
 * The documents are run through the TokenPreparation, TokenCountByDocument,
 * TFIDF and DocumentLength transforms, converted into libsvm's sparse row
 * format (label at index 0, token features from index 1 upwards) and passed
 * to \SVM::train() together with per-class weights.
 *
 * @throws \RuntimeException when there are no documents to train on
 * @inheritdoc
 */
public function prepareModel()
{
    $data = $this->applyTransform(
        new Transform\TokenPreparation(
            $this->tokenizer,
            $this->documentNormalizer,
            $this->tokenNormalizer
        ),
        $this->dataSource->getData()
    );

    $tokenCountByDocument = $this->applyTransform(
        new Transform\TokenCountByDocument(),
        $data
    );

    $documentLength = $this->applyTransform(
        new Transform\DocumentLength(),
        $this->applyTransform(
            new Transform\TFIDF(),
            $tokenCountByDocument,
            $this->applyTransform(
                new Transform\DocumentCount(),
                $data
            ),
            $this->applyTransform(
                new Transform\TokenAppearanceCount(),
                $tokenCountByDocument
            )
        )
    );

    // A category without documents contributes no training rows. Left in,
    // it would make the duplication loop below spin forever and would
    // produce a zero weight (division by zero), so drop it here.
    foreach ($documentLength as $category => $documents) {
        if (count($documents) === 0) {
            unset($documentLength[$category]);
        }
    }

    if (count($documentLength) === 0) {
        throw new \RuntimeException('Cannot prepare an SVM model without any training documents');
    }

    $categoryMap = array();
    $categoryCount = 0;
    $tokenMap = array();
    // Feature indexes start at 1 because index 0 of every row holds the label
    $tokenCount = 1;

    // Produce the token and category maps for the whole document set
    foreach ($documentLength as $category => $documents) {
        if (!array_key_exists($category, $categoryMap)) {
            $categoryMap[$category] = $categoryCount;
            $categoryCount++;
        }
        foreach ($documents as $document) {
            foreach (array_keys($document) as $token) {
                if (!array_key_exists($token, $tokenMap)) {
                    $tokenMap[$token] = $tokenCount;
                    $tokenCount++;
                }
            }
        }
    }

    // When using probabilities and our dataset is small we need to increase its
    // size by duplicating the data
    // see: http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf section "8 Probability Estimates"
    if ($this->hasThreshold()) {
        foreach ($documentLength as $category => $documents) {
            // Each pass doubles the list (foreach walks a snapshot), and it
            // is never empty here, so the loop always terminates.
            while (count($documents) <= 5) {
                foreach ($documents as $document) {
                    $documents[] = $document;
                }
            }
            $documentLength[$category] = $documents;
        }
    }

    $transform = array();

    // Prep the svm data set for use
    foreach ($documentLength as $category => $documents) {
        foreach ($documents as $document) {
            $entry = array(
                $categoryMap[$category]
            );
            foreach ($document as $token => $value) {
                $entry[$tokenMap[$token]] = $value;
            }
            // Rows are handed over with their feature indexes in ascending order
            ksort($entry, SORT_NUMERIC);
            $transform[] = $entry;
        }
    }

    // Weight the data set by the number of docs that appear in each class:
    // the smallest class gets weight 1 and larger classes proportionally less.
    $weights = array();

    foreach ($documentLength as $category => $documents) {
        $weights[$categoryMap[$category]] = count($documents);
    }

    $lowest = min($weights);

    foreach ($weights as $index => $weight) {
        $weights[$index] = $lowest / $weight;
    }

    // The model stores label => category name, hence the flip
    $this->model->setMaps(array_flip($categoryMap), $tokenMap);

    $this->model->setModel(
        $this->svm->train(
            $transform,
            $weights
        )
    );

    $this->model->setPrepared(true);
}
/**
 * Predicts the category of a document with the prepared model.
 *
 * Without a threshold the predicted category is returned directly. With a
 * threshold the category is returned only when its probability is strictly
 * greater than the threshold; otherwise the result is false.
 *
 * @inheritdoc
 */
public function classify($document)
{
    /** @var SVMModel $trained */
    $trained = $this->preparedModel();

    $labels = $trained->getCategoryMap();

    $features = $this->prepareDocument($document, $trained);

    // Plain prediction: no probability estimate required
    if (!$this->hasThreshold()) {
        $predicted = $trained->getModel()->predict($features);

        return $labels[$predicted];
    }

    // predict_probability() fills $scores by reference and returns the label
    $scores = array();
    $predicted = $trained->getModel()->predict_probability($features, $scores);

    if ($scores[$predicted] > $this->threshold) {
        return $labels[$predicted];
    }

    return false;
}
/**
 * Converts a raw document into the sparse feature array given to the model.
 *
 * The document is normalized, tokenized and (optionally) token-normalized;
 * each token known to the model's token map becomes an entry of the form
 * feature index => number of occurrences. Tokens missing from the map are
 * ignored.
 *
 * @param string $document
 * @param \Camspiers\StatisticalClassifier\Model\SVMModel $model
 * @return array feature index => occurrence count, ordered by index
 */
protected function prepareDocument($document, SVMModel $model)
{
    $indexByToken = $model->getTokenMap();

    $text = $document;
    if ($this->documentNormalizer) {
        $text = $this->documentNormalizer->normalize($text);
    }

    $terms = $this->tokenizer->tokenize($text);
    if ($this->tokenNormalizer) {
        $terms = $this->tokenNormalizer->normalize($terms);
    }

    $features = array();
    foreach (array_count_values($terms) as $term => $occurrences) {
        if (!isset($indexByToken[$term])) {
            // Token was not present in the model's token map
            continue;
        }
        $features[$indexByToken[$term]] = $occurrences;
    }

    ksort($features, SORT_NUMERIC);

    return $features;
}
/**
 * Set the threshold probability a classified document must exceed.
 *
 * Enables probability estimates on the SVM and, if the model was already
 * prepared, marks it as unprepared again.
 *
 * @param float $threshold numeric value between 0 and 1 (inclusive)
 * @throws \InvalidArgumentException when the value is not numeric or lies outside 0-1
 */
public function setThreshold($threshold)
{
    // The range test is written in negated form so that NAN, for which
    // every comparison is false, is rejected as well.
    if (!is_numeric($threshold) || !($threshold >= 0 && $threshold <= 1)) {
        throw new \InvalidArgumentException("Threshold must be a float value between 0-1");
    }

    // Store a real float so numeric strings do not leak into later comparisons
    $this->threshold = (float) $threshold;

    $this->svm->setOptions(
        array(
            \SVM::OPT_PROBABILITY => true
        )
    );

    if ($this->model->isPrepared()) {
        $this->model->setPrepared(false);
    }
}
/**
 * Returns the probabilities of the document being in each category.
 *
 * Probabilities are only available when a threshold has been set;
 * without one nothing is computed and null is returned.
 *
 * @param string $document
 * @return array|null category name => probability, or null when no threshold is set
 */
public function getProbabilities($document)
{
    if (!$this->hasThreshold()) {
        return null;
    }

    /** @var SVMModel $model */
    $model = $this->preparedModel();
    $data = $this->prepareDocument($document, $model);
    $probabilities = array();
    $model->getModel()->predict_probability($data, $probabilities);

    // classify() looks up both $probabilities and the category map by the
    // predicted label, so pair them by label here too rather than by
    // position; this also copes with the two arrays differing in size.
    $categoryMap = $model->getCategoryMap();
    $result = array();
    foreach ($probabilities as $label => $probability) {
        if (isset($categoryMap[$label])) {
            $result[$categoryMap[$label]] = $probability;
        }
    }

    return $result;
}
/**
 * Tells whether a probability threshold is currently set.
 *
 * @return bool true when the threshold property is anything other than null
 */
protected function hasThreshold()
{
    return !is_null($this->threshold);
}
297	}