diff --git "a/docs/views/data/2021-03-27-\345\246\202\344\275\225\345\210\251\347\224\250pandas\345\201\232\347\256\200\345\215\225\347\232\204\346\225\260\346\215\256\345\210\206\346\236\220.md" "b/docs/views/data/2021-03-27-\345\246\202\344\275\225\345\210\251\347\224\250pandas\345\201\232\347\256\200\345\215\225\347\232\204\346\225\260\346\215\256\345\210\206\346\236\220.md" index 2940e9a0..5b1abaf8 100644 --- "a/docs/views/data/2021-03-27-\345\246\202\344\275\225\345\210\251\347\224\250pandas\345\201\232\347\256\200\345\215\225\347\232\204\346\225\260\346\215\256\345\210\206\346\236\220.md" +++ "b/docs/views/data/2021-03-27-\345\246\202\344\275\225\345\210\251\347\224\250pandas\345\201\232\347\256\200\345\215\225\347\232\204\346\225\260\346\215\256\345\210\206\346\236\220.md" @@ -1,6 +1,7 @@ --- layout: post title: 如何用pandas做简单的数据分析 + date: 2021-03-27 author: LZY categories: diff --git "a/docs/views/data/2021-11-08-week1\345\255\246\344\271\240\345\206\205\345\256\271.md" "b/docs/views/data/2021-11-08-week1\345\255\246\344\271\240\345\206\205\345\256\271.md" new file mode 100644 index 00000000..43a20225 --- /dev/null +++ "b/docs/views/data/2021-11-08-week1\345\255\246\344\271\240\345\206\205\345\256\271.md" @@ -0,0 +1,32 @@ +--- +layout: post +title: week1学习内容 +date: 2021-11-08 +author: 饶翰宇 +categories: + - 数据分析部 +tags: + - 数据分析 + - Python +--- + +## python: + +1. pandas + +算法数据结构: + +1. 决策树 +1. 随机森林 +2. 栈和队列 + +数学: + +1. 二维随机变量的分布 + +其他: + +1. 学习了基本的markdown语法 +2. 利用Typora书写markdown +3. 安装好了pandoc,配置好了上传博客的基础工具 + diff --git "a/docs/views/data/2021-11-25-MySQL\345\222\214\346\225\260\346\215\256\345\217\257\350\247\206\345\214\226.md" "b/docs/views/data/2021-11-25-MySQL\345\222\214\346\225\260\346\215\256\345\217\257\350\247\206\345\214\226.md" new file mode 100644 index 00000000..09920885 --- /dev/null +++ "b/docs/views/data/2021-11-25-MySQL\345\222\214\346\225\260\346\215\256\345\217\257\350\247\206\345\214\226.md" @@ -0,0 +1,212 @@ +--- +layout: post +title: MySQL和数据可视化 +date: 2021-11-25 +author: 饶翰宇 +categories: + - 数据分析部 +tags: + - MySQL + - Python + - 函数画图 +--- + +## week3 + +### MySQL进阶1 + +1. 查询🐱🐉 + + - 排序查询 + + ```mysql + select * from person order by age desc,id asc; + ``` + +2. 函数🐱🚀 + + - 单行函数 + + 1. 字符函数 + + - concat + + ```mysql + select concat('这瓜','保熟','吗'); + ``` + + - length(返回字节长度) + + ```mysql + select length('张三a123'); + ``` + + - substr/substring + + ```mysql + SELECT SUBSTR('今天希望你开心',5,3); + ``` + + - upper & lower + + ```mysql + select upper('abAb1'); + select lower('abAb1'); + ``` + + - instr + + ```mysql + select instr('泊松分布','分布'); + ``` + + - trim + + ```mysql + select trim(' abcd '); + select trim('ab' from 'ab abcc b'); + ``` + + - lpad & rpad + + ```mysql + SELECT LPAD('哥谭市',8,'*'); + SELECT RPAD('哥谭市',8,'*'); + ``` + + - replace + + ```mysql + SELECT REPLACE('想登上高山欲穷千里目','想','不想'); + ``` + + + + 2. 数学函数 + + - round + + ```mysql + select round(1.22,1); + ``` + + - ceil & floor(向上、向下取整) + + ```mysql + select ceil(1.9); + select floor(1.9); + ``` + + - truncate(保留几位小数) + + ```mysql + select truncate(1.231313,3); + ``` + + - mod + + ```mysql + select mod(10,3); + ``` + + 3. 日期函数 + + - now + - curdate + - curtime + + 4. 其他函数 + + 5. 流程控制函数 + + - 分组函数(统计使用) + + + +### 绘图 + +#### 绘制正态分布:jack_o_lantern: + +1. 利用随机数绘画:baby_chick: + + - 首先利用numpy生成随机标准正态分布数组 + + ```python + import numpy as np + np.random.seed(0) + data = np.random.standard_normal(100000000) + data + ``` + + ```python + array([ 1.76405235, 0.40015721, 0.97873798, ..., 0.32191089, + 0.25199669, -1.22612391]) + ``` + + - 然后使用matplotlib绘出图像 + + ```python + import matplotlib.pyplot as plt + %matplotlib inline + plt.hist(data,1000) + ``` + +  + + + +2. 利用sympy画图:label: + + - ```mysql + from sympy import * + from sympy.stats import Normal,density + ``` + + - ```mysql + y = symbols('y') + x = symbols('x') + y = Normal(y,0,1) + plot(density(y)(x)) + ``` + + - ```python + density(y)(x) + ``` + + -  + + - + +  + + + +绘制其他函数 + +1. sympy + + - ```python + plot(x,pow(x,2)) + ``` + + -  + +2. matplotlib + + - ```python + x = np.arange(1,10,0.01) + y = np.log10(x) + u = np.arange(1,10,0.01) + w = np.exp(u) + ``` + + - ```python + plt.style.use('ggplot') + fig,ax = plt.subplots(1,2,figsize=(8,4)) + ax[0].plot(x,y,label='log10',color='r') + ax[0].legend(loc='best') + ax[1].plot(u,w,label='ex',color='b') + ax[1].legend(loc='best') + ``` + + -  diff --git "a/docs/views/data/2021-12-3-\346\234\264\347\264\240\350\264\235\345\217\266\346\226\257\347\256\227\346\263\225\345\256\236\347\216\260\346\226\207\346\234\254\345\210\206\347\261\273.md" "b/docs/views/data/2021-12-3-\346\234\264\347\264\240\350\264\235\345\217\266\346\226\257\347\256\227\346\263\225\345\256\236\347\216\260\346\226\207\346\234\254\345\210\206\347\261\273.md" new file mode 100644 index 00000000..098c6c2f --- /dev/null +++ "b/docs/views/data/2021-12-3-\346\234\264\347\264\240\350\264\235\345\217\266\346\226\257\347\256\227\346\263\225\345\256\236\347\216\260\346\226\207\346\234\254\345\210\206\347\261\273.md" @@ -0,0 +1,119 @@ +--- +layout: post +title: 朴素贝叶斯算法实现文本分类 +date: 2021-12-3 +author: 饶翰宇 +categories: + - 数据分析部 +tags: + - Python + - 文本分类 +--- + +## 文本分类 + +现实中的文本复杂多样,文本分类和文本情感分析是我们开展机器学习的重要组成部分。 + +以下将用一个案例来实现对文本的分类。 + +- 首先导入原始的数据 + + 这里我们使用一个对餐厅评价的数据集 + + ```python + import pandas as pd + data = pd.read_csv('./restaurant.csv',encoding='gb18030') + data + ``` + +  + +- 紧接着对每条数据附上标签,将star高于3的划分为1,反之则为0 + + ```python + import numpy as np + star = np.array(data.star) + star[star <= 3] = 0 + star[star > 3] = 1 + data['label'] = star + data + ``` + +  + +- 然后我们对每条评论进行切词并且新增加一列“words” + + ```python + import jieba + data['words'] = data['comment'].apply(lambda x:' '.join(jieba.lcut(x,cut_all=True))) + data + ``` + +  + +- 对数据集进行训练集和测试集的划分 + + ```python + from sklearn.model_selection import train_test_split + x_train,x_test,y_train,y_test = train_test_split(data.words,data.label,test_size=0.2,random_state=42) + ``` + +- 导入文本特征提取方法 + + ```python + from sklearn.feature_extraction.text import CountVectorizer + ``` + +- 计算次数 + + ```python + counter = CountVectorizer() + x_train = counter.fit_transform(x_train) + x_test = counter.transform(x_test) + ``` + +- 画出图表 + + ```python + amount = x_train.toarray() + name = counter.get_feature_names() + result = pd.DataFrame(data=amount,columns=name) + result + ``` + +  + +- 搭建模型 + + ```python + from sklearn.naive_bayes import MultinomialNB + estimator = MultinomialNB() + estimator.fit(x_train,y_train) + ``` + + ```python + y_predict = estimator.predict(x_test) + ``` + +  + +- 计算准确率 + + ```python + estimator.score(x_test,y_test) + ``` + + $$ + 0.8475 + $$ + + + +- 查看测试集和预测目标值的正确率 + + ```python + np.array(y_test == y_predict) + ``` + +  + diff --git "a/docs/views/data/2022-4-1-seaborn\347\273\230\345\233\276.md" "b/docs/views/data/2022-4-1-seaborn\347\273\230\345\233\276.md" new file mode 100644 index 00000000..91a51def --- /dev/null +++ "b/docs/views/data/2022-4-1-seaborn\347\273\230\345\233\276.md" @@ -0,0 +1,558 @@ +--- +layout: post +title: seaborn绘图 +date: 2022-4-1 +author: 饶翰宇 +categories: + - 数据分析部 +tags: + - Python + - seaborn +--- + +```python +import seaborn as sns +``` + + +```python +from sklearn.datasets import load_iris,load_boston +import pandas as pd +``` + + +```python +category_dataset = load_iris() +category_data = pd.DataFrame(category_dataset.data,columns=category_dataset.feature_names) +category_data['species'] = category_dataset.target +category_data +``` + +
| + | sepal length (cm) | +sepal width (cm) | +petal length (cm) | +petal width (cm) | +species | +
|---|---|---|---|---|---|
| 0 | +5.1 | +3.5 | +1.4 | +0.2 | +0 | +
| 1 | +4.9 | +3.0 | +1.4 | +0.2 | +0 | +
| 2 | +4.7 | +3.2 | +1.3 | +0.2 | +0 | +
| 3 | +4.6 | +3.1 | +1.5 | +0.2 | +0 | +
| 4 | +5.0 | +3.6 | +1.4 | +0.2 | +0 | +
| ... | +... | +... | +... | +... | +... | +
| 145 | +6.7 | +3.0 | +5.2 | +2.3 | +2 | +
| 146 | +6.3 | +2.5 | +5.0 | +1.9 | +2 | +
| 147 | +6.5 | +3.0 | +5.2 | +2.0 | +2 | +
| 148 | +6.2 | +3.4 | +5.4 | +2.3 | +2 | +
| 149 | +5.9 | +3.0 | +5.1 | +1.8 | +2 | +
150 rows × 5 columns
+ + +```python +regression_dataset = load_boston() +regression_data = pd.DataFrame(regression_dataset.data,columns=regression_dataset.feature_names) +regression_data['target'] = regression_dataset.target +regression_data +``` + +| + | CRIM | +ZN | +INDUS | +CHAS | +NOX | +RM | +AGE | +DIS | +RAD | +TAX | +PTRATIO | +B | +LSTAT | +target | +
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | +0.00632 | +18.0 | +2.31 | +0.0 | +0.538 | +6.575 | +65.2 | +4.0900 | +1.0 | +296.0 | +15.3 | +396.90 | +4.98 | +24.0 | +
| 1 | +0.02731 | +0.0 | +7.07 | +0.0 | +0.469 | +6.421 | +78.9 | +4.9671 | +2.0 | +242.0 | +17.8 | +396.90 | +9.14 | +21.6 | +
| 2 | +0.02729 | +0.0 | +7.07 | +0.0 | +0.469 | +7.185 | +61.1 | +4.9671 | +2.0 | +242.0 | +17.8 | +392.83 | +4.03 | +34.7 | +
| 3 | +0.03237 | +0.0 | +2.18 | +0.0 | +0.458 | +6.998 | +45.8 | +6.0622 | +3.0 | +222.0 | +18.7 | +394.63 | +2.94 | +33.4 | +
| 4 | +0.06905 | +0.0 | +2.18 | +0.0 | +0.458 | +7.147 | +54.2 | +6.0622 | +3.0 | +222.0 | +18.7 | +396.90 | +5.33 | +36.2 | +
| ... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +
| 501 | +0.06263 | +0.0 | +11.93 | +0.0 | +0.573 | +6.593 | +69.1 | +2.4786 | +1.0 | +273.0 | +21.0 | +391.99 | +9.67 | +22.4 | +
| 502 | +0.04527 | +0.0 | +11.93 | +0.0 | +0.573 | +6.120 | +76.7 | +2.2875 | +1.0 | +273.0 | +21.0 | +396.90 | +9.08 | +20.6 | +
| 503 | +0.06076 | +0.0 | +11.93 | +0.0 | +0.573 | +6.976 | +91.0 | +2.1675 | +1.0 | +273.0 | +21.0 | +396.90 | +5.64 | +23.9 | +
| 504 | +0.10959 | +0.0 | +11.93 | +0.0 | +0.573 | +6.794 | +89.3 | +2.3889 | +1.0 | +273.0 | +21.0 | +393.45 | +6.48 | +22.0 | +
| 505 | +0.04741 | +0.0 | +11.93 | +0.0 | +0.573 | +6.030 | +80.8 | +2.5050 | +1.0 | +273.0 | +21.0 | +396.90 | +7.88 | +11.9 | +
506 rows × 14 columns
+# 数值 + +## distribution+plot + + +```python +sns.displot(data['sepal length (cm)'],rug=True,palette='pastel',kind='hist') +``` + + + + +## 核密度估计图 + + +```python +sns.kdeplot('sepal width (cm)','petal width (cm)',data=category_data,shade=True,palette=sns.color_palette('husl',2)) +``` + + + + + + +## 双变量分布图 + + +```python +sns.jointplot(data=category_data) +``` + + + + + + +## 多变量分布图 + + +```python +sns.pairplot(data=category_data.iloc[:,:-1]) +``` + + + + + + +## 变量关系图 + + +```python +sns.relplot(x='sepal length (cm)',y='sepal width (cm)',data=category_data,hue='species',style='species',palette='pastel') +``` + + + + + + +## 散点图 + + +```python +sns.scatterplot(data=category_data.iloc[:,:-1]) +``` + + + + + + +## 线图 + + +```python +sns.lineplot(x='sepal length (cm)',y='sepal width (cm)',data=category_data,estimator=None,hue='species', + style='species',markers=True,units='sepal width (cm)') +``` + + + + + + +## 回归图 + + +```python +sns.regplot(x='sepal length (cm)',y='sepal width (cm)',color='pink',marker='*',order=2,data=category_data) +``` + + + + + +```python +sns.regplot(x=category_data['sepal length (cm)'],y=category_data['sepal width (cm)'],data=category_data) +``` + + + + + + +## 残差分布图 + + +```python +sns.residplot(x='sepal length (cm)',y='sepal width (cm)',data=category_data,color='red') +``` + + + + + +## 热力图 + + +```python +sns.heatmap(category_data.iloc[:,:-1].corr(),cbar=False,fmt='.2f',annot=True,linewidths=0.5,linecolor='gray',square=True,cmap='YlGnBu_r') +``` + + + + +```python +sns.clustermap(category_data.iloc[:,:-1].corr(),annot=True) +``` + + + + +# 分类 + +## 散点图 + + +```python +sns.stripplot(x='species',y='sepal width (cm)',data=category_data) +``` + + + + + +```python +sns.swarmplot(x='species',y='sepal width (cm)',data=category_data) +``` + + + + + +## 箱线图 + + +```python +sns.boxplot(x='species',y='sepal width (cm)',data=category_data) +``` + + + + + +```python +sns.boxenplot(x='species',y='sepal width (cm)',data=category_data) +``` + + + + + + +## 小提琴图 + + +```python +sns.violinplot(x='species',y='sepal width (cm)',data=category_data,kind='violin',split=True,bw=0.5,cut=.5,inner='stick',palette='pastel') +``` + + + + + + +## 统计图 + + +```python +sns.pointplot(x='species',y='sepal width (cm)',data=category_data,ci=50) #均值 +``` + + + + + +```python +sns.barplot(x='species',y='sepal width (cm)',data=category_data,ci=50) +``` + + + + + diff --git "a/docs/views/data/2022-4-7-\346\225\260\345\255\246\346\250\241\345\236\213.md" "b/docs/views/data/2022-4-7-\346\225\260\345\255\246\346\250\241\345\236\213.md" new file mode 100644 index 00000000..b123784d --- /dev/null +++ "b/docs/views/data/2022-4-7-\346\225\260\345\255\246\346\250\241\345\236\213.md" @@ -0,0 +1,192 @@ +--- +layout: post +title: 数学建模常用模型 +date: 2022-4-7 +author: 饶翰宇 +categories: + - 数据分析部 +tags: + - Python + - 数学建模 +--- + +```python +import numpy as np +from scipy import optimize +``` + +# 线性规划 + +$$ +2x_1+3x_2+x_3 +$$ + +$$ +\begin{cases} +x_1,x_2,x_3>0 \\ +x_1+x_2+x_3<8 \\ +x_1+4x_2+3x_3>15 \\ +x_1+3x_2+4x_3=8 +\end{cases} +$$ + + +```python +c = [2,3,1] +A_ub = [[1,1,1],[-1,-4,-3]] +b_ub = [8,-15] +A_eq = [[1,3,4]] +b_eq = [8] +bounds = [[0,None],[0,None],[0,None]] +``` + + +```python +optimize.linprog(c,A_ub,b_ub,A_eq,b_eq,bounds=bounds) +``` + + + con: array([-2.82744494e-10]) + fun: 7.762393895046949 + message: 'The algorithm terminated successfully and determined that the problem is infeasible.' + nit: 4 + slack: array([ 4.57100406, -6.09559799]) + status: 2 + success: False + x: array([1.41971392, 1.45684202, 0.55244001]) + +# 指派问题 + + +```python +assign = np.random.randint(10,size=(3,4)) +assign +``` + + + array([[2, 4, 3, 8], + [0, 3, 2, 0], + [9, 5, 0, 4]]) + + +```python +opt = optimize.linear_sum_assignment(assign) +opt +``` + + + (array([0, 1, 2]), array([0, 3, 2], dtype=int64)) + + +```python +sum(assign[opt]) +``` + + + 2 + +# 非线性规划 + + +```python +optimize.minimize(lambda x:x**2+2*x+1,x0=0) +``` + + + fun: 0.0 + hess_inv: array([[0.5]]) + jac: array([0.]) + message: 'Optimization terminated successfully.' + nfev: 6 + nit: 2 + njev: 3 + status: 0 + success: True + x: array([-1.00000001]) + +# 求函数的零点和方程组的解 + + +```python +import sympy +``` + + +```python +x = sympy.Symbol('x') +solution = sympy.solve(x**2+3*x) +solution +``` + + + [-3, 0] + + +```python +for i in solution: + print(i.evalf()) +``` + + -3.00000000000000 + 0 + +```python +x,y = sympy.symbols('x y') +``` + + +```python +s = sympy.solve((x+y-2,x-y)) +s +``` + + + {x: 1, y: 1} + + +```python +optimize.root(lambda x:x**2+3*x,x0=-4) +``` + + + fjac: array([[-1.]]) + fun: array([0.]) + message: 'The solution converged.' + nfev: 9 + qtf: array([-2.04813944e-12]) + r: array([3.00000001]) + status: 1 + success: True + x: array([-3.]) + +# 微分方程 + + +```python +from sympy import * +``` + + +```python +y = Function('y') +x = symbols('x') +``` + + +```python +eq = Eq(diff(diff(y(x),x),x)-(1+diff(y(x),x)),0) +eq +``` + +$$ +\displaystyle - \frac{d}{d x} y{\left(x \right)} + \frac{d^{2}}{d x^{2}} y{\left(x \right)} - 1 = 0 +$$ + + +```python +dsolve(eq) +``` + +$$ +\displaystyle y{\left(x \right)} = C_{1} + C_{2} e^{x} - x +$$ diff --git "a/docs/views/data/MySQL\345\222\214\346\225\260\346\215\256\345\217\257\350\247\206\345\214\226.md" "b/docs/views/data/MySQL\345\222\214\346\225\260\346\215\256\345\217\257\350\247\206\345\214\226.md" new file mode 100644 index 00000000..2462e98c --- /dev/null +++ "b/docs/views/data/MySQL\345\222\214\346\225\260\346\215\256\345\217\257\350\247\206\345\214\226.md" @@ -0,0 +1,230 @@ +--- +layout: post +title: MySQL和数据可视化 +date: 2021-11-19 +author: 饶翰宇 +categories: + - 数据分析部 +tags: + - MySQL + - Python + - 数据可视化 +--- + +## week2 + +------ + +### MySQL + +1. sql基本语法 + +- 显示当前数据库 + +```mysql +show databases; +``` + +- 创建数据库 + +```mysql +create database blocks; +``` + +- 使用数据库 + +```mysql +use blocks; +``` + +- 创建表 + +```mysql +create table person(id int(10), + name varchar(15), + age int(3)); +``` + ++ 查看表结构 + +```mysql +desc person; +``` + +- 显示当前已创建的表 + +```mysql +show tables; +``` + +- 在表中插入值 + +```mysql +insert into person(id,name,age) values(1,'华强',33); +insert into person(id,name,age) values(2,'胖虎',15); +insert into person(id,name,age) values(3,'大司马',37); +insert into person(id,name,age) values(4,NULL,1); +``` + +2. 基础查询 + +- 查询全部 + +```mysql +select * from person; +``` + + + + + +- 查询指定列 + +```mysql +select id,name from person; +``` + +- 查询常量 + +```mysql +select 100; +select '大熊'; +``` + +- 查询表达式 + +```mysql +select 100*100; +``` + +- 查询函数 + +```mysql +select version(); +``` + +3. 其他操作 + +- 起别名 + +```mysql +select name as '姓名' from person; +``` + +- concat连接 + +```mysql +select concat(id,name) as info from person; +``` + +- ifnull + +```mysql +select id,ifnull(name,0) from person; +``` + +4. 条件查询 + +- where + +```mysql +select * from person where name = '华强'; +``` + +5. 模糊查询 + +- like + +```mysql +select * from person where name like '_强'; +select * from person where name like '大%'; +``` + +- in + +```mysql +select * from person where name in ('华强'); +``` + +- between and + +```mysql +select * from person where id between 1 and 2; +``` + +- is null(is not null) + +```mysql +select * from person where id is null; +``` + + + +### 数据可视化 + +通过python我们可以实现大量数据的可视化,这里我们运用pandas对鸢尾花数据集进行演示; + +数据处理部分: + +```python +from sklearn.datasets import load_iris +import pandas as pd +``` + +```python +dataset = load_iris() +data = pd.DataFrame(data=dataset.data,columns=dataset.feature_names) +data['species'] = dataset.target_names[dataset.target] +``` + + + + + +画图处理部分: + +```python +import matplotlib as plt +plt.rcParams['font.sans-serif'] = ['SimHei'] +``` + +对类别数量进行计算: + +```python +info = data.groupby('species').groups +count = {name:len(num) for name,num in info.items()} +``` + +结果: + +```python +{'setosa': 50, 'versicolor': 50, 'virginica': 50} +``` + +开始画图: + +```python +plt.figure(figsize=(8,8)) +plt.pie(count.values(),labels=count.keys(),explode=[0.05]*3,colors=['purple','pink','yellow']) +plt.legend(loc='upper right') +plt.title('鸢尾花数据集',fontsize=30,color='black',bbox={'facecolor':'white','pad':5},loc='center') +``` + + + + + +对每个特征值进行处理并画图: + +```python +plt.figure(figsize=(8,8)) +plt.style.use('ggplot') +for i in range(1,5): + plt.subplot(2,2,i) + plt.hist(data.iloc[:,i-1],10,edgecolor='k') + plt.title(data.columns[i-1]) +``` + + + +