Code Coverage

|                      | Classes and Traits | Functions and Methods | CRAP  | Lines            |
|----------------------|--------------------|-----------------------|-------|------------------|
| Total                | 0.00% (0 / 1)      | 33.33% (1 / 3)        |       | 92.68% (76 / 82) |
| ComplementNaiveBayes | 0.00% (0 / 1)      | 33.33% (1 / 3)        | 12.06 | 92.68% (76 / 82) |
| __construct(DataSourceInterface $dataSource, ModelInterface $model = null, Document\NormalizerInterface $documentNormalizer = null, TokenizerInterface $tokenizer = null, Token\NormalizerInterface $tokenNormalizer = null) | | 100.00% (1 / 1) | 4 | 100.00% (6 / 6) |
| prepareModel()       |                    | 0.00% (0 / 1)         | 1.00  | 94.23% (49 / 52) |
| classify($document)  |                    | 0.00% (0 / 1)         | 7.10  | 87.50% (21 / 24) |
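The CRAP (Change Risk Anti-Patterns) index combines a method's cyclomatic complexity with its test coverage; higher values indicate complex, poorly covered code that is risky to change.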
<?php
/**
 * This file is part of the Statistical Classifier package.
 *
 * (c) Cam Spiers <camspiers@gmail.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
namespace Camspiers\StatisticalClassifier\Classifier;

use Camspiers\StatisticalClassifier\DataSource\DataSourceInterface;
use Camspiers\StatisticalClassifier\Model\Model;
use Camspiers\StatisticalClassifier\Model\ModelInterface;
use Camspiers\StatisticalClassifier\Normalizer\Document;
use Camspiers\StatisticalClassifier\Normalizer\Token;
use Camspiers\StatisticalClassifier\Tokenizer\TokenizerInterface;
use Camspiers\StatisticalClassifier\Tokenizer\Word;
use Camspiers\StatisticalClassifier\Transform;

/**
 * An implementation of a Complement Naive Bayes classifier.
 *
 * This classifier is based on *Tackling the Poor Assumptions of Naive Bayes Text Classifiers* by Jason Rennie.
 *
 * @author  Cam Spiers <camspiers@gmail.com>
 * @package Statistical Classifier
 */
class ComplementNaiveBayes extends Classifier
{
    /**
     * Tokenizer (the way of breaking up documents)
     * @var TokenizerInterface
     */
    protected $tokenizer;

    /**
     * Takes a document and makes it consistent
     * @var Document\NormalizerInterface
     */
    protected $documentNormalizer;

    /**
     * Takes tokenized data and makes it consistent or stems it
     * @var Token\NormalizerInterface
     */
    protected $tokenNormalizer;

    /**
     * Create the Complement Naive Bayes classifier
     * @param DataSourceInterface          $dataSource
     * @param ModelInterface               $model              A model to store data in
     * @param Document\NormalizerInterface $documentNormalizer The normalizer to make documents consistent
     * @param TokenizerInterface           $tokenizer          The tokenizer to break up the documents
     * @param Token\NormalizerInterface    $tokenNormalizer    The normalizer to make tokens consistent
     */
    public function __construct(
        DataSourceInterface $dataSource,
        ModelInterface $model = null,
        Document\NormalizerInterface $documentNormalizer = null,
        TokenizerInterface $tokenizer = null,
        Token\NormalizerInterface $tokenNormalizer = null
    ) {
        $this->dataSource = $dataSource;
        $this->model = $model ?: new Model();
        $this->documentNormalizer = $documentNormalizer ?: new Document\Lowercase();
        $this->tokenizer = $tokenizer ?: new Word();
        $this->tokenNormalizer = $tokenNormalizer;
    }

    /**
     * @inheritdoc
     */
    public function prepareModel()
    {
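        // Build the model by chaining transforms over the training data:
        // tokenize and normalize the documents, count tokens per document and
        // per category, apply TF-IDF weighting and document-length adjustment,
        // compute complement-class statistics, and finally derive the
        // per-category token weights that classify() consumes.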
        $data = $this->applyTransform(
            new Transform\TokenPreparation(
                $this->tokenizer,
                $this->documentNormalizer,
                $this->tokenNormalizer
            ),
            $this->dataSource->getData()
        );
        $tokenCountByDocument = $this->applyTransform(
            new Transform\TokenCountByDocument(),
            $data
        );
        $documentCount = $this->applyTransform(
            new Transform\DocumentCount(),
            $data
        );
        unset($data);
        $tokenAppearanceCount = $this->applyTransform(
            new Transform\TokenAppearanceCount(),
            $tokenCountByDocument
        );
        $tokensByCategory = $this->applyTransform(
            new Transform\TokensByCategory(),
            $tokenCountByDocument
        );
        $tfidf = $this->applyTransform(
            new Transform\TFIDF(),
            $tokenCountByDocument,
            $documentCount,
            $tokenAppearanceCount
        );
        unset($tokenCountByDocument);
        unset($tokenAppearanceCount);
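        // Adjust for document length and gather per-document token counts
        // before computing the complement-class statistics.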
        $documentLength = $this->applyTransform(
            new Transform\DocumentLength(),
            $tfidf
        );
        unset($tfidf);
        $documentTokenCounts = $this->applyTransform(
            new Transform\DocumentTokenCounts(),
            $documentLength
        );
        $complement = $this->applyTransform(
            new Transform\Complement(),
            $documentLength,
            $tokensByCategory,
            $documentCount,
            $documentTokenCounts
        );
        unset(
            $documentLength,
            $tokensByCategory,
            $documentCount,
            $documentTokenCounts
        );
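        // Turn the complement-class statistics into the per-category token
        // weights that classify() sums over.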
        $this->model->setModel(
            $this->applyTransform(
                new Transform\Weight(),
                $complement
            )
        );
        $this->model->setPrepared(true);
    }
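
    /*
     * For reference, the weighting scheme described in the Rennie et al. paper
     * cited above estimates each token weight from the documents that are
     * *not* in the category:
     *
     *     w(c, t) = log( (N~c,t + a_t) / (N~c + a) )
     *
     * where N~c,t is the (TF-IDF adjusted) count of token t outside category c,
     * N~c is the total token count outside c, and a is the sum of the smoothing
     * terms a_t (typically one per vocabulary entry). The weights are then
     * magnitude-normalised per category, and a document is assigned to the
     * category that minimises the sum of its token counts times these weights.
     * The Complement and Weight transforms above are assumed to implement this
     * scheme; see classify() below for the decision rule.
     */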

    /**
     * @inheritdoc
     */
    public function classify($document)
    {
        $results = array();
        if ($this->documentNormalizer) {
            $document = $this->documentNormalizer->normalize($document);
        }
        $tokens = $this->tokenizer->tokenize($document);
        if ($this->tokenNormalizer) {
            $tokens = $this->tokenNormalizer->normalize($tokens);
        }
        $tokens = array_count_values($tokens);
        $weights = $this->preparedModel()->getModel();
        foreach (array_keys($weights) as $category) {
            $results[$category] = 0;
            foreach ($tokens as $token => $count) {
                if (array_key_exists($token, $weights[$category])) {
                    $results[$category] += $count * $weights[$category][$token];
                }
            }
        }
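        // Ascending sort: the category with the lowest aggregate complement
        // weight wins; if the two best scores are identical the result is
        // ambiguous and false is returned.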
        asort($results, SORT_NUMERIC);
        $category = key($results);
        $value = array_shift($results);
        if ($value === array_shift($results)) {
            return false;
        } else {
            return $category;
        }
    }
}
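
Taken together, a minimal usage sketch might look like the following. Only DataSourceInterface appears in this file, so the DataArray data source and its addDocument() call are assumptions about the rest of the package and may need adjusting:

<?php

use Camspiers\StatisticalClassifier\Classifier\ComplementNaiveBayes;
use Camspiers\StatisticalClassifier\DataSource\DataArray; // assumed in-memory data source

// Train on a few labelled example documents (assumed addDocument(category, document) API)
$source = new DataArray();
$source->addDocument('spam', 'win money now');
$source->addDocument('ham', 'meeting at noon tomorrow');

// Defaults are used for the model, document normalizer and tokenizer
$classifier = new ComplementNaiveBayes($source);
$classifier->prepareModel();

// Returns the best-matching category name, or false when the top scores tie
var_dump($classifier->classify('free money'));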