구글 드라이브 연동 (로컬 환경에서는 불필요함)

# Mount Google Drive when running on Colab. Outside Colab the google.colab
# package does not exist, so skip the mount instead of crashing on the import.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping the Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive

# Base folder on the mounted Drive; the CSV reads further down prepend it to each file name.
path = '/content/drive/MyDrive/bigdatas/'

넘파이

import numpy as np

# Integers 0..14 laid out as 3 rows x 5 columns.
ar1 = np.reshape(np.arange(15), (3, 5))
ar1

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

# Two 2x2 integer matrices used to demonstrate the matrix product.
A = np.array([[1, 2],
              [3, 4]])
B = np.array([[2, 3],
              [4, 5]])

np.matmul(A, B)  # NumPy matrix product (same operation as A @ B)

array([[10, 13],
       [22, 29]])

np.random.randn(3,2)  # 3x2 array of random floats; no seed is set in this file, so values change per run

array([[ 0.41582846,  0.45521608],
       [ 0.9416985 , -0.06815956],
       [ 0.48875541,  0.53657774]])

np.random.randint(1, 100, size=6).reshape(3,2)  # six random integers (low=1, high=100) arranged as 3x2

array([[43, 60],
       [16, 85],
       [72, 13]])

# Rebind B to a 3x4 grid holding 0..11 (until here it was a 2x2 matrix).
B = np.reshape(np.arange(12), (3, 4))
B

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

B.sum(axis=0) # column-wise: one total per column

array([12, 15, 18, 21])

B.mean(axis=1) # row-wise: one mean per row

array([1.5, 5.5, 9.5])

# Lay 0..29 out as 5 rows x 6 columns, then keep every second column (0, 2, 4).
x = np.arange(30).reshape(5, 6)[:, 0::2]
x

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16],
       [18, 20, 22],
       [24, 26, 28]])

판다스

import pandas as pd

# Build a DataFrame from a dict that maps each column name to its list of values.
dic = {
    "gender": [1, 2, 1, 2],
    "bloodtype": ["A", "B", "O", "AB"],
}
df1 = pd.DataFrame(data=dic)
df1

# Six consecutive dates starting 2021-03-09, used as the row index.
dates = pd.date_range(start="20210309", periods=6)
# Build a DataFrame from a NumPy array: 6x4 random values under columns A-D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

df.tail() # last 5 rows

df.describe()  # per-column summary: count, mean, std, min, quartiles, max

df.sort_values(by="B") # rows ordered by column B

# Load the student list (CSV decoded as cp949); this rebinds df, replacing the random frame built earlier.
df=pd.read_csv(path + "studentlist.csv", encoding="cp949")
df

df[0:3]  # first three rows

df[["age", "grade"]]  # pick two columns by name

df.iloc[1:5, 0:3]  # positional slice: rows, then columns

# name and age of the rows whose bloodtype is B or A
df.loc[df["bloodtype"].isin( ["B", "A"] ), ["name", "age"]]

df2 = df[1:3]   # 2 rows
df3 = df[5:8]
df4 = pd.concat([df2, df3], axis=0) # stack the two slices row-wise
df4

dff= pd.read_csv(path+"footsize.csv", encoding="cp949")
# Join the student list to the foot-size table on df.name == dff.realname.
# Note: this rebinds df3, which previously held the df[5:8] slice.
df3= pd.merge(df, dff, left_on='name', right_on='realname')
df3

# Mean of every numeric column for each (sex, bloodtype) group.
# numeric_only=True is needed on pandas >= 2.0, where averaging the text columns
# (name, absence) raises TypeError instead of silently dropping them.
df.groupby(["sex", "bloodtype"]).mean(numeric_only=True)

맷플롯립

import matplotlib.pyplot as plt

plt.plot( df[["height", "weight"]] )  # draws one line per selected column

[<matplotlib.lines.Line2D at 0x7fe80e23cc50>,
 <matplotlib.lines.Line2D at 0x7fe80e1f5e50>]

plt.plot( df["height"], df["weight"] , 'gs') # format string = colour + marker/line style (o, ^, s, -, --); 'gs' is green squares

[<matplotlib.lines.Line2D at 0x7fe80e6ceb10>]

# Category labels (x) and their values (y), shared by the three panels below.
names = ["group_a", "group_b", "group_c"]
values = [1, 10, 100]

plt.figure(figsize=(7, 3))  # overall figure size (width, height)

# 2 x 3 grid: bar chart in slot 1, scatter in slot 2, line plot in slot 6.
plt.subplot(2, 3, 1)
plt.bar(names, values)
plt.subplot(2, 3, 2)
plt.scatter(names, values)
plt.subplot(2, 3, 6)
plt.plot(names, values)
plt.suptitle("Categorical Plotting")

Text(0.5, 0.98, 'Categorical Plotting')

plt.scatter ( "height", "weight", c="grade" ,data=df) # c: colour each point by its grade value

<matplotlib.collections.PathCollection at 0x7fe80db70fd0>

기술통계

import seaborn as sns
import pandas as pd

titanic= sns.load_dataset("titanic")
# Give the points a different colour per value of the `class` column.
sns.scatterplot(x = 'fare', y = 'age', data = titanic, hue='class')

<matplotlib.axes._subplots.AxesSubplot at 0x7fe7fec33310>

titanic.to_csv("titanic.csv", index=False)  # save to titanic.csv without the index column
titanic.info()  # prints column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB

print(titanic.dtypes)
print()
# Boolean-mask the dtype Series to keep only the int64 columns.
titanic.dtypes [ titanic.dtypes==np.int64 ]

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

survived    int64
pclass      int64
sibsp       int64
parch       int64
dtype: object

titanic['survived'].astype(float)  # float view of the column for display; nothing is assigned back

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: survived, Length: 891, dtype: float64

titanic.sex.value_counts(normalize=True)  # share of each sex rather than raw counts

male      0.647587
female    0.352413
Name: sex, dtype: float64

titanic.survived.value_counts().plot(kind='pie') # kind='bar' is the alternative

<matplotlib.axes._subplots.AxesSubplot at 0x7fe7feb47ed0>

sns.countplot(x='survived', data=titanic)  # count of rows per survived value

<matplotlib.axes._subplots.AxesSubplot at 0x7fe7feaa7950>

# Median and 75th percentile of fare.
print(titanic['fare'].quantile(0.5), titanic['fare'].quantile(0.75))
plt.boxplot(titanic.fare, vert=False)  # vert=False: draw the box horizontally

14.4542 31.0

{'boxes': [<matplotlib.lines.Line2D at 0x7fe7fea3fb50>],
 'caps': [<matplotlib.lines.Line2D at 0x7fe7fea46b90>,
  <matplotlib.lines.Line2D at 0x7fe7fea4f110>],
 'fliers': [<matplotlib.lines.Line2D at 0x7fe7fea4fbd0>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x7fe7fea4f690>],
 'whiskers': [<matplotlib.lines.Line2D at 0x7fe7fea46110>,
  <matplotlib.lines.Line2D at 0x7fe7fea46650>]}

# Columns that each get their own horizontal box plot.
numeric_columns = ['age', 'fare']

# One stacked panel per column; count from 1 because subplot slots are 1-based.
for i, c in enumerate(numeric_columns, start=1):
    plt.subplot(len(numeric_columns), 1, i)
    plt.title(c)
    # Drop missing values first (age has gaps); vert=False draws the box horizontally.
    plt.boxplot(titanic[c].dropna(), vert=False)
plt.tight_layout()  # keep the lower panel's title clear of the upper panel
# Render once, after every panel exists; showing inside the loop flushed the
# figure after the first panel, so the grid was never filled.
plt.show()

sns.histplot(x='fare', data=titanic, bins=[0,100,200,300,400,500,600]) # explicit bin edges; bins may also be a bin count

<matplotlib.axes._subplots.AxesSubplot at 0x7fe7fe95e0d0>

# Both calls only display their result; neither is assigned back to titanic.
titanic.dropna(subset=['age'])  # rows that have an age value

titanic.age.fillna( titanic.age.median()) # replace missing ages with the median age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

# NOTE(review): only "man" and "woman" are listed as categories; pandas maps any other
# label in `who` to NaN - confirm the column holds no further labels.
pd.Categorical( titanic.who, categories=["man", "woman"])

['man', 'woman', 'woman', 'woman', 'man', ..., 'man', 'woman', 'woman', 'man', 'man']
Length: 891
Categories (2, object): ['man', 'woman']

통계검정

import scipy.stats as st
import numpy as np

# Draw a small fixed-size sample (30 rows) and a larger 20% sample.
sample1 = titanic.sample(n=30)
sample2 = titanic.sample(frac=0.2)

# The confidence level is passed positionally: SciPy renamed the keyword from
# `alpha` to `confidence` and later removed `alpha`, so the positional form is
# the one spelling that runs on both old and new releases.

# Small sample: 95% confidence interval for the mean fare from the
# t distribution with n-1 degrees of freedom.
print(
    "sample1:",
    np.mean(sample1.fare),
    st.t.interval(
        0.95,
        df=len(sample1) - 1,
        loc=np.mean(sample1.fare),
        scale=st.sem(sample1.fare),
    ),
)
# Large sample: 95% confidence interval from the normal distribution.
print(
    "sample2:",
    np.mean(sample2.fare),
    st.norm.interval(0.95, loc=np.mean(sample2.fare), scale=st.sem(sample2.fare)),
)
print("population:", np.mean(titanic.fare))

sample1: 24.72999666666667 (7.08208569698947, 42.37790763634386)
sample2: 25.84440393258429 (20.87124582777264, 30.817562037395938)
population: 32.2042079685746

st.ttest_1samp(sample1.fare, 20)  # one-sample t-test of sample1's fares against a mean of 20

Ttest_1sampResult(statistic=0.5481628622491003, pvalue=0.5877752933697968)

from scipy.stats import chisquare

# Count sample2's rows per class, then run a chi-square test on those counts.
num_class=sample2["class"].value_counts()
print (num_class)
# No expected frequencies are passed; per SciPy's documentation the test then
# assumes the classes are equally likely.
print (chisquare (num_class))

Third     102
First      40
Second     36
Name: class, dtype: int64
Power_divergenceResult(statistic=46.15730337078652, pvalue=9.485689908542853e-11)

	A	B	C	D
2021-03-09	1.408594	-0.360413	-0.658324	-1.524480
2021-03-10	-0.199638	0.808160	0.973753	-0.213467
2021-03-11	-0.312141	-0.199871	0.492419	-0.654314
2021-03-12	1.626976	-0.194653	-1.064668	-1.527266
2021-03-13	-0.248378	-0.575600	-2.008171	-1.580801
2021-03-14	0.114982	-0.451940	1.172818	0.967527

	A	B	C	D
2021-03-10	-0.199638	0.808160	0.973753	-0.213467
2021-03-11	-0.312141	-0.199871	0.492419	-0.654314
2021-03-12	1.626976	-0.194653	-1.064668	-1.527266
2021-03-13	-0.248378	-0.575600	-2.008171	-1.580801
2021-03-14	0.114982	-0.451940	1.172818	0.967527

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	0.398399	-0.162386	-0.182029	-0.755467
std	0.882163	0.497636	1.262302	1.013996
min	-0.312141	-0.575600	-2.008171	-1.580801
25%	-0.236193	-0.429058	-0.963082	-1.526570
50%	-0.042328	-0.280142	-0.082952	-1.089397
75%	1.085191	-0.195957	0.853420	-0.323679
max	1.626976	0.808160	1.172818	0.967527

	A	B	C	D
2021-03-13	-0.248378	-0.575600	-2.008171	-1.580801
2021-03-14	0.114982	-0.451940	1.172818	0.967527
2021-03-09	1.408594	-0.360413	-0.658324	-1.524480
2021-03-11	-0.312141	-0.199871	0.492419	-0.654314
2021-03-12	1.626976	-0.194653	-1.064668	-1.527266
2021-03-10	-0.199638	0.808160	0.973753	-0.213467

	name	sex	age	grade	absence	bloodtype	height	weight	dept
0	김길동	남자	23	3	유	O	165.3	68.2	2
1	이미린	여자	22	2	무	AB	170.1	53.0	3
2	홍길동	남자	24	4	무	B	175.0	80.1	1
3	김철수	남자	23	3	무	AB	182.1	85.7	2
4	손세수	여자	20	1	유	A	168.0	49.5	1
5	박미희	여자	21	2	무	O	162.0	52.0	3
6	강수친	여자	22	1	무	O	155.2	45.3	2
7	이희수	여자	23	1	무	A	176.9	55.0	1
8	이철린	남자	23	3	무	B	178.5	64.2	1
9	방희철	남자	22	2	무	B	176.1	61.3	3
10	박수호	남자	24	4	유	O	167.1	62.0	3
11	임동민	남자	22	2	무	AB	180.0	75.8	3
12	김민수	남자	21	1	무	A	162.2	55.3	1
13	이희진	여자	23	3	무	O	176.1	53.1	2
14	김미진	여자	22	2	무	B	158.2	45.2	3
15	김동수	남자	24	4	유	B	168.6	70.2	1
16	여수근	남자	21	1	무	A	169.2	62.2	2

	age	grade
0	23	3
1	22	2
2	24	4
3	23	3
4	20	1
5	21	2
6	22	1
7	23	1
8	23	3
9	22	2
10	24	4
11	22	2
12	21	1
13	23	3
14	22	2
15	24	4
16	21	1

		age	grade	height	weight	dept
sex	bloodtype
남자	A	21.00	1.00	165.700000	58.750000	1.500000
	AB	22.50	2.50	181.050000	80.750000	2.500000
	B	23.25	3.25	174.550000	68.950000	1.500000
	O	23.50	3.50	166.200000	65.100000	2.500000
여자	A	21.50	1.00	172.450000	52.250000	1.000000
	AB	22.00	2.00	170.100000	53.000000	3.000000
	B	22.00	2.00	158.200000	45.200000	3.000000
	O	22.00	2.00	164.433333	50.133333	2.333333

	survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22.0	1	0	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38.0	1	0	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26.0	0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35.0	1	0	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35.0	0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
885	0	3	female	39.0	0	5	29.1250	Q	Third	woman	False	NaN	Queenstown	no	False
886	0	2	male	27.0	0	0	13.0000	S	Second	man	True	NaN	Southampton	no	True
887	1	1	female	19.0	0	0	30.0000	S	First	woman	False	B	Southampton	yes	True
889	1	1	male	26.0	0	0	30.0000	C	First	man	True	C	Cherbourg	yes	True
890	0	3	male	32.0	0	0	7.7500	Q	Third	man	True	NaN	Queenstown	no	True

	gender	bloodtype
0	1	A
1	2	B
2	1	O
3	2	AB

	age	grade
0	23	3
1	22	2
2	24	4
3	23	3
4	20	1
5	21	2
6	22	1
7	23	1
8	23	3
9	22	2
10	24	4
11	22	2
12	21	1
13	23	3
14	22	2
15	24	4
16	21	1

	age	grade
0	23	3
1	22	2
2	24	4
3	23	3
4	20	1
5	21	2
6	22	1
7	23	1
8	23	3
9	22	2
10	24	4
11	22	2
12	21	1
13	23	3
14	22	2
15	24	4
16	21	1