《Python数据挖掘与机器学习实战》—3.7.3 异常数据分析
3.7.3 异常数据分析
这里对于异常数据进行检测,代码如下:
def sigmoid(X):
    """Return the logistic (sigmoid) value 1 / (1 + e^-X).

    X is passed straight to ``exp``, so it may be anything that function
    accepts (a scalar, or an array/matrix when ``exp`` is element-wise).
    """
    denominator = 1 + exp(-X)
    return 1.0 / denominator
class logRegressClassifier(object):
    """Binary logistic-regression classifier trained by gradient ascent.

    State held on the instance:
      dataMat  -- sample matrix, one row per sample, first column fixed at 1.0
      labelMat -- column matrix of integer labels, one row per sample
      weights  -- weight column vector set by train()
    """

    def __init__(self):
        self.dataMat = list()
        self.labelMat = list()
        self.weights = list()

    # Data loading
    def loadDataSet(self, filename):
        """Load samples from a whitespace-separated text file.

        Each non-blank line is parsed as floats; the last field is the
        label (stored as int) and the remaining fields are the features.
        A constant 1.0 is prepended to every feature row so the first
        weight acts as the bias term. Any previously loaded data is
        replaced.

        Raises:
            OSError: if *filename* cannot be opened.
            ValueError: if a field cannot be parsed as a float.
        """
        rows = []
        labels = []
        # `with` guarantees the file handle is closed, also on parse errors.
        with open(filename) as fr:
            for line in fr:
                lineArr = line.strip().split()
                if not lineArr:
                    # A blank line would otherwise yield an empty feature
                    # row and a bogus label of 1.
                    continue
                dataLine = [1.0]
                dataLine.extend(float(field) for field in lineArr)
                label = dataLine.pop()  # the last column is the label
                rows.append(dataLine)
                labels.append(int(label))
        # NOTE(review): `mat` is expected to be NumPy's matrix constructor;
        # NumPy 2.0 dropped the `mat` alias in favour of `asmatrix` --
        # confirm which NumPy version this file targets.
        self.dataMat = mat(rows)
        self.labelMat = mat(labels).transpose()

    # Training
    def train(self):
        """Fit the model and store the result in ``self.weights``.

        Uses the single-pass stochastic variant, stocGradAscent1().
        """
        self.weights = self.stocGradAscent1()

    # Weight solvers; the variants differ in update schedule and step size.
    def batchGradAscent(self):
        """Return weights after 500 full-batch gradient-ascent steps.

        Starts from an all-ones (n, 1) vector and uses a fixed step size
        of 0.001; every step uses all samples at once.
        """
        m, n = shape(self.dataMat)
        alpha = 0.001
        maxCycles = 500
        weights = ones((n, 1))  # initial weights
        for k in range(maxCycles):
            h = sigmoid(self.dataMat * weights)
            error = self.labelMat - h
            weights += alpha * self.dataMat.transpose() * error  # update
        return weights

    def stocGradAscent1(self):
        """Return weights after one in-order pass of stochastic updates.

        Starts from an all-ones (n, 1) vector; each sample is visited once,
        in file order, with a fixed step size of 0.01.
        """
        m, n = shape(self.dataMat)
        alpha = 0.01
        weights = ones((n, 1))
        for i in range(m):
            h = sigmoid(sum(self.dataMat[i] * weights))
            error = self.labelMat[i] - h
            weights += (alpha * error * self.dataMat[i]).transpose()
        return weights

    def stocGradAscent2(self):
        """Return weights after two randomised passes with a decaying step.

        In each pass every sample is used exactly once, in random order.
        The step size is ``4 / (1 + j + i) + 0.0001`` for pass ``j`` and
        step ``i``, so it shrinks as training proceeds but never reaches 0.
        """
        numIter = 2
        m, n = shape(self.dataMat)
        weights = ones((n, 1))  # initial weights
        for j in range(numIter):
            # Must be a real list: a range object does not support item
            # deletion on Python 3.
            dataIndex = list(range(m))
            for i in range(m):
                # alpha decreases as the iterations progress
                alpha = 4 / (1.0 + j + i) + 0.0001
                randIndex = int(random.uniform(0, len(dataIndex)))
                # Translate the random position into a sample index and
                # remove it, so no sample is drawn twice within a pass.
                sample = dataIndex.pop(randIndex)
                h = sigmoid(sum(self.dataMat[sample] * weights))
                error = self.labelMat[sample] - h
                weights += (alpha * error * self.dataMat[sample]).transpose()
        return weights

    # Classification
    def classify(self, X):
        """Return 1.0 if the predicted probability for X exceeds 0.5, else 0.0.

        X is one sample row laid out like a row of ``dataMat`` (including
        the leading 1.0); ``self.weights`` must already be set.
        """
        prob = sigmoid(sum(X * self.weights))
        return 1.0 if prob > 0.5 else 0.0

    # Demo / smoke test
    def test(self):
        """Load 'testData.dat' and print the weights from all three solvers."""
        self.loadDataSet('testData.dat')
        weights0 = self.batchGradAscent()
        weights1 = self.stocGradAscent1()
        weights2 = self.stocGradAscent2()
        # Labels name the method that produced each result.
        print('batchGradAscent:', weights0)
        print('stocGradAscent1:', weights1)
        print('stocGradAscent2:', weights2)
# 12 rows (months) x 31 columns (days) grid; every cell starts at 0 and is
# later overwritten with the data belonging to that month/day.
df_month = [[0 for _ in range(31)] for _ in range(12)]
def classify_df():
    """Fill the global ``df_month`` grid with per-day slices of ``df``.

    Cell ``[month - 1][day - 1]`` receives the rows of ``df`` whose
    ' month' and ' day' columns equal that month (1-12) and day (1-31).
    The filled grid is also returned.
    """
    for month in range(1, 13):
        in_month = df[' month'] == month
        for day in range(1, 32):
            on_day = df[' day'] == day
            df_month[month - 1][day - 1] = df.loc[in_month & on_day]
    return df_month
# Populate df_month before anything is plotted.
classify_df()

# Font size of the tick labels on both axes.
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12

# X coordinates: column 3 of the frame stored at df_month[1][1]
# (presumably the hour of day -- confirm against the data file).
# `.iloc` is used for positional column access because `.ix` was removed
# in pandas 1.0.
x = list(df_month[1][1].iloc[:, 3])
# One random colour code (0-9) per sample point.
c = np.random.randint(0, 10, len(x))
# Overlay the first twenty day-slices of df_month[1] (February); column 4
# presumably holds the PM2.5 reading -- confirm against the data file.
for n in range(20):
    plt.scatter(x, df_month[1][n].iloc[:, 4].astype(float), marker='.', c=c)
plt.xlabel('hour')
plt.ylabel('Pm2.5:ug/m3')
plt.title('2016/2PUKOU')
plt.colorbar()
# Render the figure.
plt.show()
# Script entry point: build a classifier and run its demo routine.
if __name__ == '__main__':
    logRegressClassifier().test()
打印出二月份前二十天的PM2.5浓度分布,如图3-11所示,曲线外的点为异常值。
图3-11 二月份前二十天浦口某站PM2.5浓度分布
- 点赞
- 收藏
- 关注作者
评论(0)