Code Coverage

|                      | Classes and Traits | Functions and Methods | CRAP  | Lines            |
|----------------------|--------------------|-----------------------|-------|------------------|
| Total                | 0.00% (0 / 1)      | 33.33% (1 / 3)        |       | 92.68% (76 / 82) |
| ComplementNaiveBayes | 0.00% (0 / 1)      | 33.33% (1 / 3)        | 12.06 | 92.68% (76 / 82) |
| __construct(DataSourceInterface $dataSource, ModelInterface $model = null, Document\NormalizerInterface $documentNormalizer = null, TokenizerInterface $tokenizer = null, Token\NormalizerInterface $tokenNormalizer = null) | | 100.00% (1 / 1) | 4 | 100.00% (6 / 6) |
| prepareModel()       |                    | 0.00% (0 / 1)         | 1.00  | 94.23% (49 / 52) |
| classify($document)  |                    | 0.00% (0 / 1)         | 7.10  | 87.50% (21 / 24) |
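The CRAP (Change Risk Anti-Patterns) index combines a method's cyclomatic complexity with its test coverage; higher values indicate complex, poorly covered code that is risky to change.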
<?php
/**
 * This file is part of the Statistical Classifier package.
 *
 * (c) Cam Spiers <camspiers@gmail.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
namespace Camspiers\StatisticalClassifier\Classifier;

use Camspiers\StatisticalClassifier\DataSource\DataSourceInterface;
use Camspiers\StatisticalClassifier\Model\Model;
use Camspiers\StatisticalClassifier\Model\ModelInterface;
use Camspiers\StatisticalClassifier\Normalizer\Document;
use Camspiers\StatisticalClassifier\Normalizer\Token;
use Camspiers\StatisticalClassifier\Tokenizer\TokenizerInterface;
use Camspiers\StatisticalClassifier\Tokenizer\Word;
use Camspiers\StatisticalClassifier\Transform;

/**
 * An implementation of a Complement Naive Bayes classifier.
 *
 * This classifier is based on *Tackling the Poor Assumptions of Naive Bayes Text Classifiers* by Jason Rennie.
 *
 * @author  Cam Spiers <camspiers@gmail.com>
 * @package Statistical Classifier
 */
class ComplementNaiveBayes extends Classifier
{
    /**
     * Tokenizer (the way of breaking up documents)
     * @var TokenizerInterface
     */
    protected $tokenizer;

    /**
     * Takes a document and makes it consistent
     * @var Document\NormalizerInterface
     */
    protected $documentNormalizer;

    /**
     * Takes tokenized data and makes it consistent or stems it
     * @var Token\NormalizerInterface
     */
    protected $tokenNormalizer;

    /**
     * Create the Complement Naive Bayes classifier
     * @param DataSourceInterface          $dataSource
     * @param ModelInterface               $model              A model to store data in
     * @param Document\NormalizerInterface $documentNormalizer The normalizer to make documents consistent
     * @param TokenizerInterface           $tokenizer          The tokenizer to break up the documents
     * @param Token\NormalizerInterface    $tokenNormalizer    The normalizer to make tokens consistent
     */
    public function __construct(
        DataSourceInterface $dataSource,
        ModelInterface $model = null,
        Document\NormalizerInterface $documentNormalizer = null,
        TokenizerInterface $tokenizer = null,
        Token\NormalizerInterface $tokenNormalizer = null
    ) {
        $this->dataSource = $dataSource;
        $this->model = $model ?: new Model();
        $this->documentNormalizer = $documentNormalizer ?: new Document\Lowercase();
        $this->tokenizer = $tokenizer ?: new Word();
        $this->tokenNormalizer = $tokenNormalizer;
    }

    /**
     * @inheritdoc
     */
    public function prepareModel()
    {
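        // Build the model by chaining transforms over the training data:
        // tokenize and normalize the documents, count tokens per document and
        // per category, apply TF-IDF weighting and document-length adjustment,
        // compute complement-class statistics, and finally derive the
        // per-category token weights that classify() consumes.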
        $data = $this->applyTransform(
            new Transform\TokenPreparation(
                $this->tokenizer,
                $this->documentNormalizer,
                $this->tokenNormalizer
            ),
            $this->dataSource->getData()
        );
        $tokenCountByDocument = $this->applyTransform(
            new Transform\TokenCountByDocument(),
            $data
        );
        $documentCount = $this->applyTransform(
            new Transform\DocumentCount(),
            $data
        );
        unset($data);
        $tokenAppearanceCount = $this->applyTransform(
            new Transform\TokenAppearanceCount(),
            $tokenCountByDocument
        );
        $tokensByCategory = $this->applyTransform(
            new Transform\TokensByCategory(),
            $tokenCountByDocument
        );
        $tfidf = $this->applyTransform(
            new Transform\TFIDF(),
            $tokenCountByDocument,
            $documentCount,
            $tokenAppearanceCount
        );
        unset($tokenCountByDocument);
        unset($tokenAppearanceCount);
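        // Adjust for document length and gather per-document token counts
        // before computing the complement-class statistics.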
        $documentLength = $this->applyTransform(
            new Transform\DocumentLength(),
            $tfidf
        );
        unset($tfidf);
        $documentTokenCounts = $this->applyTransform(
            new Transform\DocumentTokenCounts(),
            $documentLength
        );
        $complement = $this->applyTransform(
            new Transform\Complement(),
            $documentLength,
            $tokensByCategory,
            $documentCount,
            $documentTokenCounts
        );
        unset(
            $documentLength,
            $tokensByCategory,
            $documentCount,
            $documentTokenCounts
        );
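        // Turn the complement-class statistics into the per-category token
        // weights that classify() sums over.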
        $this->model->setModel(
            $this->applyTransform(
                new Transform\Weight(),
                $complement
            )
        );
        $this->model->setPrepared(true);
    }
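
    /*
     * For reference, the weighting scheme described in the Rennie et al. paper
     * cited above estimates each token weight from the documents that are
     * *not* in the category:
     *
     *     w(c, t) = log( (N~c,t + a_t) / (N~c + a) )
     *
     * where N~c,t is the (TF-IDF adjusted) count of token t outside category c,
     * N~c is the total token count outside c, and a is the sum of the smoothing
     * terms a_t (typically one per vocabulary entry). The weights are then
     * magnitude-normalised per category, and a document is assigned to the
     * category that minimises the sum of its token counts times these weights.
     * The Complement and Weight transforms above are assumed to implement this
     * scheme; see classify() below for the decision rule.
     */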

    /**
     * @inheritdoc
     */
    public function classify($document)
    {
        $results = array();
        if ($this->documentNormalizer) {
            $document = $this->documentNormalizer->normalize($document);
        }
        $tokens = $this->tokenizer->tokenize($document);
        if ($this->tokenNormalizer) {
            $tokens = $this->tokenNormalizer->normalize($tokens);
        }
        $tokens = array_count_values($tokens);
        $weights = $this->preparedModel()->getModel();
        foreach (array_keys($weights) as $category) {
            $results[$category] = 0;
            foreach ($tokens as $token => $count) {
                if (array_key_exists($token, $weights[$category])) {
                    $results[$category] += $count * $weights[$category][$token];
                }
            }
        }
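        // Ascending sort: the category with the lowest aggregate complement
        // weight wins; if the two best scores are identical the result is
        // ambiguous and false is returned.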
        asort($results, SORT_NUMERIC);
        $category = key($results);
        $value = array_shift($results);
        if ($value === array_shift($results)) {
            return false;
        } else {
            return $category;
        }
    }
}
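
Taken together, a minimal usage sketch might look like the following. Only DataSourceInterface appears in this file, so the DataArray data source and its addDocument() call are assumptions about the rest of the package and may need adjusting:

<?php

use Camspiers\StatisticalClassifier\Classifier\ComplementNaiveBayes;
use Camspiers\StatisticalClassifier\DataSource\DataArray; // assumed in-memory data source

// Train on a few labelled example documents (assumed addDocument(category, document) API)
$source = new DataArray();
$source->addDocument('spam', 'win money now');
$source->addDocument('ham', 'meeting at noon tomorrow');

// Defaults are used for the model, document normalizer and tokenizer
$classifier = new ComplementNaiveBayes($source);
$classifier->prepareModel();

// Returns the best-matching category name, or false when the top scores tie
var_dump($classifier->classify('free money'));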