# NOTE: The lines below are GitHub web-page chrome that was captured when this
# file was scraped; they are not part of the module itself.
# Skip to content
# Permalink
# 40dd4250a1
# Switch branches/tags
#
# Name already in use
#
# A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
# Go to file
#
#
# Cannot retrieve contributors at this time
# 62 lines (52 sloc) 2.57 KB
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
import re
class Binning:
    """Discretise one column of a DataFrame into quantile-based bins.

    Two strategies are offered:

    * rank-based binning (``equal_frequency_binning_by_rank*``), which ranks
      the data first so that ties are broken and the bins are equally
      populated;
    * value-based binning (``equal_frequency_binning_duplicate_drop``), which
      cuts at the quantiles of the raw values and merges duplicated quantile
      edges, so the bins are not necessarily equally frequent.

    Attributes:
        desired_bins_count: Requested number of bins, capped at the number of
            rows of ``data`` (left untouched when ``None``).
        dim: Label of the column being binned.
        data: The DataFrame holding the column.
        global_min: Lower limit used for the first bound, or ``None``.
        bounds: ``pd.Series`` of bin edges, set by
            ``equal_frequency_binning_duplicate_drop``; ``None`` before that.
        bins_count: Number of bins actually produced; ``None`` before binning.
        qcut: Result of the most recent binning call; ``None`` before binning.
        rank_data: Ranked copy of ``data``, set by the rank-based binning.
    """

    def __init__(self, data, dim, desired_bins_count, global_min=None):
        """Store the binning configuration.

        Args:
            data: DataFrame containing the column ``dim``.
            dim: Label of the column to bin.
            desired_bins_count: Requested number of bins. When it is not
                ``None`` and is at least the number of rows, the number of
                rows is used instead.
            global_min: Optional global minimum of the column; one less than
                this value becomes the lowest bound in
                ``equal_frequency_binning_duplicate_drop``.
        """
        if desired_bins_count is None or data.shape[0] > desired_bins_count:
            self.desired_bins_count = desired_bins_count
        else:
            self.desired_bins_count = data.shape[0]
        self.dim = dim
        self.data = data
        self.global_min = global_min
        # Initialise result attributes so that methods can test them for None
        # instead of failing with AttributeError.
        self.bounds = None
        self.bins_count = None
        self.qcut = None
        self.rank_data = None

    def equal_frequency_binning_by_rank(self):
        """Bin the column by the quantiles of its ranks.

        Ranks are computed with ``method='first'``, so tied values receive
        distinct ranks before ``pd.qcut`` is applied.

        Returns:
            The categorical Series produced by ``pd.qcut`` (also stored in
            ``self.qcut``).
        """
        self.rank_data = self.data.rank(method='first')
        self.bins_count = self.desired_bins_count
        self.qcut = pd.qcut(self.rank_data[self.dim], self.bins_count)
        return self.qcut

    def equal_frequency_binning_by_rank_int_categories(self):
        """Rank-based binning with categories renamed to ``0 .. n-1``.

        Returns:
            The categorical Series with integer category labels (also stored
            in ``self.qcut``).
        """
        self.equal_frequency_binning_by_rank()
        labels = list(range(self.desired_bins_count))
        # rename_categories already returns a new Series on the same index,
        # so no re-indexing is required afterwards.
        self.qcut = self.qcut.cat.rename_categories(labels)
        return self.qcut

    def equal_frequency_binning_duplicate_drop(self):
        """Bin the column at its value quantiles, merging duplicate edges.

        Sets ``self.bounds`` to the lowest bound followed by the upper edge of
        every bin, ``self.bins_count`` to the number of bins, and
        ``self.qcut`` to the binned column with integer category labels.

        The lowest bound is ``global_min - 1``; when ``global_min`` was not
        supplied, the minimum of the column is used in its place.

        Returns:
            The categorical Series with categories ``0 .. bins_count-1``.
        """
        qcut, edges = self._cut_by_quantiles()
        column = self.data[self.dim]
        lowest = self.global_min if self.global_min is not None else column.min()
        # Use the exact quantile edges rather than parsing the (rounded)
        # category labels; the margin of 1 below the minimum keeps the
        # minimum itself inside the first, left-open interval.
        bounds = [lowest - 1]
        bounds.extend(float(edge) for edge in edges[1:])
        self.bounds = pd.Series(bounds)
        self.bins_count = len(qcut.cat.categories)
        self.qcut = qcut.cat.rename_categories(list(range(self.bins_count)))
        return self.qcut

    def _compute_qcut(self):
        """Return the column cut at its de-duplicated quantile edges."""
        return self._cut_by_quantiles()[0]

    def _cut_by_quantiles(self):
        """Cut the column at its quantiles; return ``(binned, edges)``."""
        column = self.data[self.dim]
        quantiles = np.linspace(0, 1, self.desired_bins_count + 1)
        # Repeated quantile values would give pd.cut duplicate edges, so they
        # are collapsed; pd.unique keeps the order of first appearance.
        edges = pd.unique(column.quantile(quantiles).values)
        binned = pd.cut(column, edges, include_lowest=True)
        return binned, edges

    def interpolate(self, other_bin):
        """Bin the same column of another DataFrame using the stored bounds.

        When the maximum of the other column exceeds the largest stored bound,
        a new upper bound and a new category are added and ``bins_count`` is
        incremented; these changes persist on the instance.

        Args:
            other_bin: DataFrame containing the column ``self.dim``.

        Returns:
            The other column as a categorical Series with categories
            ``0 .. bins_count-1``.

        Raises:
            ValueError: If no bounds have been computed yet.
        """
        if self.bounds is None:
            raise ValueError('No bounds!')

        other_col = other_bin[self.dim]
        other_max = other_col.max()
        if other_max > self.bounds.max():
            self.bounds = pd.concat(
                [self.bounds, pd.Series([other_max])], ignore_index=True
            )
            self.qcut = self.qcut.cat.add_categories(self.bins_count)
            self.bins_count += 1

        binned = pd.cut(other_col, self.bounds.values)
        return binned.cat.rename_categories(list(range(self.bins_count)))