Open In Colab

구글 연동(로컬환경에서는 불필요함)

# Mount Google Drive so its files are reachable under /content/drive
# (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Directory prefix for the CSV files read later; the trailing slash matters
# because file names are appended with plain string concatenation.
path = '/content/drive/MyDrive/bigdatas/'

넘파이

import numpy as np

# Build the integers 0..14 and lay them out as a 3x5 matrix (arange + reshape).
ar1 = np.reshape(np.arange(15), (3, 5))
ar1
array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])
A = np.array([[1, 2],
              [3, 4]])
B = np.array([[2, 3],
              [4, 5]])

np.matmul(A, B)  # matrix product of the two 2x2 arrays (same as A @ B)
array([[10, 13],
       [22, 29]])
# 3x2 array of samples drawn from the standard normal distribution.
np.random.randn(3,2)
array([[ 0.41582846,  0.45521608],
       [ 0.9416985 , -0.06815956],
       [ 0.48875541,  0.53657774]])
# Six random integers from 1 up to (but not including) 100, arranged as 3x2.
np.random.randint(1, 100, size=6).reshape(3,2)
array([[43, 60],
       [16, 85],
       [72, 13]])
# Integers 0..11 as a 3x4 matrix; -1 lets NumPy infer the column count.
B = np.arange(12).reshape(3, -1)
B
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
B.sum(axis=0) # column-wise: collapses the rows, giving one sum per column
array([12, 15, 18, 21])
B.mean(axis=1) # row-wise: collapses the columns, giving one mean per row
array([1.5, 5.5, 9.5])
# Take every second column (columns 0, 2, 4) of a 5x6 matrix holding 0..29.
x = np.reshape(np.arange(30), (5, 6))[:, 0::2]
x
array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16],
       [18, 20, 22],
       [24, 26, 28]])

판다스

import pandas as pd

# Build a DataFrame from a dict: each key becomes a column, each list its values.
dic = dict(
    gender=[1, 2, 1, 2],
    bloodtype=["A", "B", "O", "AB"],
)
df1 = pd.DataFrame(data=dic)
df1
gender bloodtype
0 1 A
1 2 B
2 1 O
3 2 AB
# Six consecutive daily timestamps starting on 2021-03-09, used as the row index.
dates = pd.date_range(start="20210309", periods=6)
# Build a DataFrame from a NumPy array of standard-normal samples (6 rows x 4 columns).
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df
A B C D
2021-03-09 1.408594 -0.360413 -0.658324 -1.524480
2021-03-10 -0.199638 0.808160 0.973753 -0.213467
2021-03-11 -0.312141 -0.199871 0.492419 -0.654314
2021-03-12 1.626976 -0.194653 -1.064668 -1.527266
2021-03-13 -0.248378 -0.575600 -2.008171 -1.580801
2021-03-14 0.114982 -0.451940 1.172818 0.967527
df.tail() # show the last 5 rows (the default count)
A B C D
2021-03-10 -0.199638 0.808160 0.973753 -0.213467
2021-03-11 -0.312141 -0.199871 0.492419 -0.654314
2021-03-12 1.626976 -0.194653 -1.064668 -1.527266
2021-03-13 -0.248378 -0.575600 -2.008171 -1.580801
2021-03-14 0.114982 -0.451940 1.172818 0.967527
# Per-column summary statistics: count, mean, std, min, quartiles, max.
df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.398399 -0.162386 -0.182029 -0.755467
std 0.882163 0.497636 1.262302 1.013996
min -0.312141 -0.575600 -2.008171 -1.580801
25% -0.236193 -0.429058 -0.963082 -1.526570
50% -0.042328 -0.280142 -0.082952 -1.089397
75% 1.085191 -0.195957 0.853420 -0.323679
max 1.626976 0.808160 1.172818 0.967527
df.sort_values(by="B") # sort the rows by column B (ascending by default)
A B C D
2021-03-13 -0.248378 -0.575600 -2.008171 -1.580801
2021-03-14 0.114982 -0.451940 1.172818 0.967527
2021-03-09 1.408594 -0.360413 -0.658324 -1.524480
2021-03-11 -0.312141 -0.199871 0.492419 -0.654314
2021-03-12 1.626976 -0.194653 -1.064668 -1.527266
2021-03-10 -0.199638 0.808160 0.973753 -0.213467
# Load the student roster from the data directory, decoding it as cp949.
# This rebinds `df`, replacing the random demo frame built earlier.
df=pd.read_csv(path + "studentlist.csv", encoding="cp949")
df
name sex age grade absence bloodtype height weight dept
0 김길동 남자 23 3 O 165.3 68.2 2
1 이미린 여자 22 2 AB 170.1 53.0 3
2 홍길동 남자 24 4 B 175.0 80.1 1
3 김철수 남자 23 3 AB 182.1 85.7 2
4 손세수 여자 20 1 A 168.0 49.5 1
5 박미희 여자 21 2 O 162.0 52.0 3
6 강수친 여자 22 1 O 155.2 45.3 2
7 이희수 여자 23 1 A 176.9 55.0 1
8 이철린 남자 23 3 B 178.5 64.2 1
9 방희철 남자 22 2 B 176.1 61.3 3
10 박수호 남자 24 4 O 167.1 62.0 3
11 임동민 남자 22 2 AB 180.0 75.8 3
12 김민수 남자 21 1 A 162.2 55.3 1
13 이희진 여자 23 3 O 176.1 53.1 2
14 김미진 여자 22 2 B 158.2 45.2 3
15 김동수 남자 24 4 B 168.6 70.2 1
16 여수근 남자 21 1 A 169.2 62.2 2
# First three rows (the slice end is exclusive).
df[0:3]
name sex age grade absence bloodtype height weight dept
0 김길동 남자 23 3 O 165.3 68.2 2
1 이미린 여자 22 2 AB 170.1 53.0 3
2 홍길동 남자 24 4 B 175.0 80.1 1
# Select several columns by passing a list of column names.
df[["age", "grade"]]
age grade
0 23 3
1 22 2
2 24 4
3 23 3
4 20 1
5 21 2
6 22 1
7 23 1
8 23 3
9 22 2
10 24 4
11 22 2
12 21 1
13 23 3
14 22 2
15 24 4
16 21 1
df.iloc[1:5, 0:3]  # positional selection (row slice, column slice): rows 1-4, columns 0-2
name sex age
1 이미린 여자 22
2 홍길동 남자 24
3 김철수 남자 23
4 손세수 여자 20
# Name and age of the students whose blood type is A or B
# (boolean row mask + list of column labels).
df.loc[df["bloodtype"].isin( ["B", "A"] ), ["name", "age"]]
name age
2 홍길동 24
4 손세수 20
7 이희수 23
8 이철린 23
9 방희철 22
12 김민수 21
14 김미진 22
15 김동수 24
16 여수근 21
# Stack two row slices of the roster on top of each other.
df2 = df.iloc[1:3]  # two rows (positions 1 and 2)
df3 = df.iloc[5:8]  # three rows (positions 5 to 7)
df4 = pd.concat([df2, df3])  # default axis=0 appends rows
df4
name sex age grade absence bloodtype height weight dept
1 이미린 여자 22 2 AB 170.1 53.0 3
2 홍길동 남자 24 4 B 175.0 80.1 1
5 박미희 여자 21 2 O 162.0 52.0 3
6 강수친 여자 22 1 O 155.2 45.3 2
7 이희수 여자 23 1 A 176.9 55.0 1
# Read the foot-size table and attach it to the roster, matching the roster's
# `name` column against the foot-size table's `realname` column
# (merge defaults to an inner join, so unmatched students are dropped).
dff = pd.read_csv(path + "footsize.csv", encoding="cp949")
df3 = df.merge(dff, left_on="name", right_on="realname")
df3
name sex age grade absence bloodtype height weight dept realname footsize
0 김길동 남자 23 3 O 165.3 68.2 2 김길동 275
1 이미린 여자 22 2 AB 170.1 53.0 3 이미린 245
2 김철수 남자 23 3 AB 182.1 85.7 2 김철수 280
3 손세수 여자 20 1 A 168.0 49.5 1 손세수 240
4 박미희 여자 21 2 O 162.0 52.0 3 박미희 240
5 강수친 여자 22 1 O 155.2 45.3 2 강수친 245
6 이희수 여자 23 1 A 176.9 55.0 1 이희수 245
7 이철린 남자 23 3 B 178.5 64.2 1 이철린 260
8 방희철 남자 22 2 B 176.1 61.3 3 방희철 275
9 박수호 남자 24 4 O 167.1 62.0 3 박수호 280
10 임동민 남자 22 2 AB 180.0 75.8 3 임동민 280
11 김민수 남자 21 1 A 162.2 55.3 1 김민수 270
12 이희진 여자 23 3 O 176.1 53.1 2 이희진 245
13 김미진 여자 22 2 B 158.2 45.2 3 김미진 235
14 김동수 남자 24 4 B 168.6 70.2 1 김동수 265
15 여수근 남자 21 1 A 169.2 62.2 2 여수근 265
# Mean of every numeric column per (sex, bloodtype) group.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer
# silently drops non-numeric columns such as `name` and raises TypeError.
df.groupby(["sex", "bloodtype"]).mean(numeric_only=True)
age grade height weight dept
sex bloodtype
남자 A 21.00 1.00 165.700000 58.750000 1.500000
AB 22.50 2.50 181.050000 80.750000 2.500000
B 23.25 3.25 174.550000 68.950000 1.500000
O 23.50 3.50 166.200000 65.100000 2.500000
여자 A 21.50 1.00 172.450000 52.250000 1.000000
AB 22.00 2.00 170.100000 53.000000 3.000000
B 22.00 2.00 158.200000 45.200000 3.000000
O 22.00 2.00 164.433333 50.133333 2.333333

맷플롯립(Matplotlib)

import matplotlib.pyplot as plt

# One line per selected column, plotted against the row index.
plt.plot( df[["height", "weight"]] )
[<matplotlib.lines.Line2D at 0x7fe80e23cc50>,
 <matplotlib.lines.Line2D at 0x7fe80e1f5e50>]
plt.plot( df["height"], df["weight"] , 'gs') # format string = colour + marker/line style (o, ^, s, -, --); 'gs' is green squares
[<matplotlib.lines.Line2D at 0x7fe80e6ceb10>]
names = ['group_a', 'group_b', 'group_c']  # x-axis categories
values = [1, 10, 100]  # y-axis values

plt.figure(figsize=(7, 3))  # overall figure size (width, height)

# Three panels placed in a 2-row x 3-column grid: slots 1, 2 and 6.
plt.subplot(2, 3, 1)
plt.bar(names, values)
plt.subplot(2, 3, 2)
plt.scatter(names, values)
plt.subplot(2, 3, 6)
plt.plot(names, values)
plt.suptitle('Categorical Plotting')
Text(0.5, 0.98, 'Categorical Plotting')
plt.scatter ( "height", "weight", c="grade" ,data=df) # c: colour each point by its value in the named column
<matplotlib.collections.PathCollection at 0x7fe80db70fd0>

기술통계

import seaborn as sns
import pandas as pd

titanic = sns.load_dataset("titanic")
# Fare vs. age, with each point coloured by passenger class.
sns.scatterplot(data=titanic, x="fare", y="age", hue="class")
<matplotlib.axes._subplots.AxesSubplot at 0x7fe7fec33310>
# Save a copy to the working directory, leaving out the row index.
titanic.to_csv("titanic.csv", index=False) 
# Print column names, non-null counts and dtypes.
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
print(titanic.dtypes)
print()
# Keep only the entries whose dtype is int64.
titanic.dtypes.loc[titanic.dtypes == np.int64]
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

survived    int64
pclass      int64
sibsp       int64
parch       int64
dtype: object
# Float version of the column; the result is not assigned back to `titanic`.
titanic['survived'].astype(float)
0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: survived, Length: 891, dtype: float64
# Share of each value (fractions summing to 1) instead of raw counts.
titanic.sex.value_counts(normalize=True)
male      0.647587
female    0.352413
Name: sex, dtype: float64
titanic.survived.value_counts().plot(kind='pie') # pie chart of the counts; kind='bar' draws a bar chart instead
<matplotlib.axes._subplots.AxesSubplot at 0x7fe7feb47ed0>
# Bar chart of the number of rows for each `survived` value.
sns.countplot(x='survived', data=titanic)
<matplotlib.axes._subplots.AxesSubplot at 0x7fe7feaa7950>
# Median and upper quartile of the fare, then a horizontal box plot of it.
print(*(titanic['fare'].quantile(q) for q in (0.5, 0.75)))
plt.boxplot(titanic.fare, vert=False)
14.4542 31.0
{'boxes': [<matplotlib.lines.Line2D at 0x7fe7fea3fb50>],
 'caps': [<matplotlib.lines.Line2D at 0x7fe7fea46b90>,
  <matplotlib.lines.Line2D at 0x7fe7fea4f110>],
 'fliers': [<matplotlib.lines.Line2D at 0x7fe7fea4fbd0>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x7fe7fea4f690>],
 'whiskers': [<matplotlib.lines.Line2D at 0x7fe7fea46110>,
  <matplotlib.lines.Line2D at 0x7fe7fea46650>]}
numeric_columns = ['age', 'fare']

# One horizontal box plot per numeric column, stacked in a single figure.
for i, c in enumerate(numeric_columns, start=1):
    plt.subplot(len(numeric_columns), 1, i)
    plt.title(c)
    # Drop missing values before plotting; vert=False lays the box out horizontally.
    plt.boxplot(titanic[c].dropna(), vert=False)
plt.tight_layout()  # keep each title from overlapping the neighbouring panel
# Render once, after every panel is drawn: calling show() inside the loop
# finishes the figure after the first panel and puts the second in a new one.
plt.show()
sns.histplot(x='fare', data=titanic, bins=[0,100,200,300,400,500,600]) # explicit bin edges; bins= may also be a bin count
<matplotlib.axes._subplots.AxesSubplot at 0x7fe7fe95e0d0>
# Rows with a missing age removed; the result is not assigned back to `titanic`.
titanic.dropna(subset=['age'])
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
885 0 3 female 39.0 0 5 29.1250 Q Third woman False NaN Queenstown no False
886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

714 rows × 15 columns

titanic.age.fillna( titanic.age.median()) # replace missing ages with the median; the result is not assigned back
0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64
# Restrict `who` to the two listed categories; any value outside the list
# becomes NaN (NOTE(review): the dataset may contain other values such as
# 'child' — confirm whether dropping them is intended).
pd.Categorical( titanic.who, categories=["man", "woman"])
['man', 'woman', 'woman', 'woman', 'man', ..., 'man', 'woman', 'woman', 'man', 'man']
Length: 891
Categories (2, object): ['man', 'woman']

통계검정

import scipy.stats as st
import numpy as np

sample1 = titanic.sample(n=30)      # small sample: 30 rows
sample2 = titanic.sample(frac=0.2)  # large sample: 20% of the rows

# 95% confidence intervals for the mean fare.  The confidence level is passed
# positionally because SciPy renamed the keyword from `alpha` to `confidence`
# (`alpha=` was deprecated in SciPy 1.9 and later removed); the positional
# form works on both old and new versions.

# Small sample: Student's t distribution with n-1 degrees of freedom.
print(
    "sample1:",
    np.mean(sample1.fare),
    st.t.interval(
        0.95,
        df=len(sample1) - 1,
        loc=np.mean(sample1.fare),
        scale=st.sem(sample1.fare),
    ),
)
# Large sample: normal approximation.
print(
    "sample2:",
    np.mean(sample2.fare),
    st.norm.interval(
        0.95,
        loc=np.mean(sample2.fare),
        scale=st.sem(sample2.fare),
    ),
)
print("population:", np.mean(titanic.fare))
sample1: 24.72999666666667 (7.08208569698947, 42.37790763634386)
sample2: 25.84440393258429 (20.87124582777264, 30.817562037395938)
population: 32.2042079685746
# One-sample t-test of the sample's mean fare against a hypothesised mean of 20.
st.ttest_1samp(sample1.fare, 20)
Ttest_1sampResult(statistic=0.5481628622491003, pvalue=0.5877752933697968)
from scipy.stats import chisquare

# Observed row counts per passenger class in the 20% sample.
num_class = sample2["class"].value_counts()
print(num_class)
# Chi-square goodness-of-fit test; with no expected frequencies supplied,
# all categories are assumed equally likely.
print(chisquare(f_obs=num_class))
Third     102
First      40
Second     36
Name: class, dtype: int64
Power_divergenceResult(statistic=46.15730337078652, pvalue=9.485689908542853e-11)