[SSUDA] 캐글 이용자 2021 설문조사 결과 분석
# Install the Kaggle CLI; the second command force-reinstalls/upgrades the
# package itself without touching its dependencies.
!pip install kaggle
!pip install --upgrade --force-reinstall --no-deps kaggle
# Colab-only helper: upload a file from the local machine
# (presumably the kaggle.json API token that is copied below).
from google.colab import files
files.upload()
# Copy the token into ~/.kaggle and make it readable by the owner only.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Download the survey archive and unpack it into the working directory.
!kaggle competitions download -c kaggle-survey-2021
!unzip kaggle-survey-2021.zip
import gc # For Memory Optimization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # Not sure if I used this
from wordcloud import WordCloud
from scipy.stats import norm
# Some more necessary libraries (These are for drawing the image on the bar charts)
import matplotlib.font_manager as fm
from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox
import matplotlib.image as mpimg
# To Avoid unnecessary warnings
import warnings
warnings.filterwarnings('ignore')
# The survey table is very wide, so raise the pandas display limits
pd.options.display.max_rows = 100
pd.options.display.max_columns = 400

# The first data row describes the columns rather than holding a response,
# so it is dropped right after loading.
df = pd.read_csv('kaggle_survey_2021_responses.csv').iloc[1:, :]

# Styled preview of the first three responses
df.head(3).style.set_properties(
    **{"background-color": "#76c5d6", "color": "black", "border-color": "black"}
)

print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

# Q1: distribution of the answers
df['Q1'].value_counts()
fig, ax = plt.subplots(figsize=(25,10), facecolor="w")


def make_img(img, zoom, x, y):
    """Draw an image on the module-level ``ax`` at data coordinates (x, y).

    Args:
        img: Path, URL, or binary file-like object of the image to draw.
        zoom: Scale factor handed to ``OffsetImage``.
        x: X position of the image anchor, in data coordinates.
        y: Y position of the image anchor, in data coordinates.
    """
    import io
    import urllib.parse
    import urllib.request

    # Passing a URL string straight to ``imread`` was deprecated in
    # Matplotlib 3.4 and is rejected by newer releases, so remote images are
    # downloaded first and decoded from memory. PNG is assumed for URLs,
    # which is what ``imread`` itself used to assume. The ``> 1`` check keeps
    # Windows drive letters ("C:\\...") from being mistaken for a URL scheme.
    if isinstance(img, str) and len(urllib.parse.urlparse(img).scheme) > 1:
        with urllib.request.urlopen(img) as response:
            pixels = mpimg.imread(io.BytesIO(response.read()), format="png")
    else:
        pixels = mpimg.imread(img)
    imagebox = OffsetImage(pixels, zoom=zoom)
    ab = AnnotationBbox(imagebox, (x, y), frameon=False)
    ax.add_artist(ab)
# Crown icon that is placed on the tallest bar
img_file = "https://www.freeiconspng.com/thumbs/crown-icon/queen-crown-icon-4.png"
zoom = 1
img_y = 4.8

# The buckets are hand-ordered so that the largest group ends up in the
# middle of the chart; the counts are hard-coded (presumably copied from
# df['Q1'].value_counts()).
age_bucket = ['70+','55-59','45-49','35-39','22-24','25-29','18-21','30-34','40-44','50-54','60-69']
age_bucket_cnt = [128,592,1375,2504,4694,4931,4901,3441,1890,964,553]

# Per-bar styling: alternating colours, wider and more opaque bars in the centre
color = ['#E6E6E6' if pos % 2 == 0 else '#189AB4' for pos in range(11)]
width = [0.8] * 4 + [0.9] * 3 + [0.8] * 4
alpha = [0.3, 0.45, 0.5, 0.6, 0.75, 1.0, 0.75, 0.6, 0.5, 0.45, 0.3]
fontsize = [20, 20, 20, 20, 25, 35, 30, 20, 20, 20, 20]
x_num = list(range(11))

for i, bucket in enumerate(age_bucket):
    plt.bar(x=bucket, height=age_bucket_cnt[i], width=width[i],
            color=color[i], alpha=alpha[i])
    # Label each bar with its bucket name just above the bar top
    plt.text(s=bucket, x=x_num[i], y=age_bucket_cnt[i], va='bottom',
             ha='center', fontsize=fontsize[i], alpha=alpha[i])

# Chart title, drawn as text because the axes are hidden below
plt.text(s="Age Bucket of all Kagglers", x=5, y=5500, fontsize=50,
         va='bottom', ha='center', color='#189AB4')

make_img(img_file, 0.2, 5, 4700)
gc.collect()  # For Memory Optimization
plt.axis('off')
plt.show()
확실히 대학생이나 취업 준비생이 많이 이용하는 느낌이다.
다만 18-21세 연령대 이용률이 생각보다 높은 것이 신기했다.
df['Q2'].value_counts()

# Wedge labels and their hard-coded sizes
Gender = ['Man', 'Woman', 'Others']
Gender_cnt = [20598, 4890, 485]
# Palette; the three wedges only consume the first three entries
colors = ['#E6E6E6', '#189AB4', '#FFFF00', '#ADFF2F', '#FFA500']
# Offset of each wedge from the centre; the third wedge is pulled out the most
explode = (0.05, 0.05, 0.2)

plt.figure(figsize=[20,10])
plt.pie(
    Gender_cnt,
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
    pctdistance=1.2,
)
plt.legend(Gender, title="Genders", loc="upper right", prop={'size': 15})

# A white disc over the centre turns the pie into a donut
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.rcParams.update({'font.size': 25})

# Title of the chart
plt.text(s="Gender Diversity in Kaggle", x=0, y=1.3, fontsize=50,
         va='bottom', ha='center', color='#189AB4')
gc.collect()
plt.show()
남자가 약 80%, 여자가 약 18%이고, 기타(공개를 원하지 않음, 미기재 등)가 약 2%입니다.
확실히 남성이 주류인 분야인 것 같습니다.
df['Q3'].value_counts()
!pip install geopandas
import geopandas as gpd
# List of countries we are interested in
lis_countries = ["Algeria","Argentina","Australia","Austria","Bangladesh","Belarus","Belgium","Brazil","Canada","Chile","China","Colombia",
"Czechia","Denmark","Ecuador","Egypt","Ethiopia","France","Germany","Ghana","Greece","India","Indonesia","Iraq","Ireland",
"Israel","Italy","Japan","Kazakhstan","Kenya","Malaysia","Mexico","Morocco","Nepal","Netherlands","Nigeria","Norway","Pakistan",
"Peru","Philippines","Poland","Portugal","Romania","Russia","Saudi Arabia","South Africa","South Korea","Spain","Sri Lanka",
"Sweden","Switzerland","Taiwan","Thailand","Tunisia","Turkey","Uganda","Ukraine","United Arab Emirates","United Kingdom",
"United States of America","Vietnam"]
# Reading the geopandas data
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
country_data = lis_countries # Passing the list of countries here
country_geo = list(world['name']) # The country list from the geopandas dataset
# List of all the values of population of Kagglers from each country
lis_pop = [44,182,264,51,317,51,65,751,331,102,814,225,63,48,50,482,43,401,470,99,102,7434,444,43,84,138,311,921,45,248,156,279,140,75,153,
702,45,530,117,108,219,119,61,742,89,146,359,454,106,81,71,334,123,109,416,47,186,111,550,2650,277]
# Next we need to create a dataframe with lis_countries and lis_pop
our_country_analysis = pd.DataFrame(lis_countries, columns=['Country'])
our_country_analysis['KagglePopulation'] = lis_pop
# Next, we are going to visualize this...
mapped = world.set_index('name').join(our_country_analysis.set_index('Country')).reset_index()
to_be_mapped = 'KagglePopulation'
vmin, vmax = 0,10000
fig, ax = plt.subplots(1, figsize=(25,30))
mapped.dropna().plot(column=to_be_mapped, cmap='cividis', linewidth=0.8, ax=ax, edgecolors='1', alpha=0.7)
ax.text(s="Kagglers All Around the Globe",x=0,y=100, fontsize=50,va='bottom',ha='center',color='#189AB4')
ax.set_axis_off()
sm = plt.cm.ScalarMappable(cmap='cividis', norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm._A = []
gc.collect()
cbar = fig.colorbar(sm, orientation='vertical', shrink= .25)
캐글 이용자 수가 많은 나라일수록 노란색에 가깝게 표시됩니다.
인도 사람들이 확실히 많이 이용하는 모습이군요.
중간중간 하얗게 빈 나라들도 있습니다.
df['Q4'].value_counts()
fig, ax = plt.subplots(figsize=(25,10), facecolor="w")


def make_img(img, zoom, x, y):
    """Draw an image on the module-level ``ax`` at data coordinates (x, y).

    Args:
        img: Path, URL, or binary file-like object of the image to draw.
        zoom: Scale factor handed to ``OffsetImage``.
        x: X position of the image anchor, in data coordinates.
        y: Y position of the image anchor, in data coordinates.
    """
    import io
    import urllib.parse
    import urllib.request

    # Passing a URL string straight to ``imread`` was deprecated in
    # Matplotlib 3.4 and is rejected by newer releases, so remote images are
    # downloaded first and decoded from memory. PNG is assumed for URLs,
    # which is what ``imread`` itself used to assume. The ``> 1`` check keeps
    # Windows drive letters ("C:\\...") from being mistaken for a URL scheme.
    if isinstance(img, str) and len(urllib.parse.urlparse(img).scheme) > 1:
        with urllib.request.urlopen(img) as response:
            pixels = mpimg.imread(io.BytesIO(response.read()), format="png")
    else:
        pixels = mpimg.imread(img)
    imagebox = OffsetImage(pixels, zoom=zoom)
    ab = AnnotationBbox(imagebox, (x, y), frameon=False)
    ax.add_artist(ab)
# Crown icon that is placed on the tallest bar
img_file = "https://www.freeiconspng.com/thumbs/crown-icon/queen-crown-icon-4.png"
zoom = 1
img_y = 4.8

# Education levels, hand-ordered so that the largest group sits in the middle.
# The variable names are carried over from the age chart; they hold
# education labels and counts here.
age_bucket = ['Professional Doctorate','High School','Bachelor’s degree','Master’s degree','Doctoral degree','Others','No Answer']
age_bucket_cnt = [360,417,9907,10132,2795,1735,627]

# Per-bar styling: alternating colours, wider and more opaque bars in the centre
color = ['#E6E6E6' if pos % 2 == 0 else '#189AB4' for pos in range(7)]
width = [0.8, 0.8, 0.9, 0.9, 0.9, 0.8, 0.8]
alpha = [0.5, 0.6, 0.75, 1.0, 0.75, 0.6, 0.5]
fontsize = [12, 16, 18, 21, 16, 16, 16]
x_num = list(range(7))

for i, level in enumerate(age_bucket):
    plt.bar(x=level, height=age_bucket_cnt[i], width=width[i],
            color=color[i], alpha=alpha[i])
    # Label each bar with its category name just above the bar top
    plt.text(s=level, x=x_num[i], y=age_bucket_cnt[i], va='bottom',
             ha='center', fontsize=fontsize[i], alpha=alpha[i])

# Chart title, drawn as text because the axes are hidden below
plt.text(s="Educational Qualifications of all Kagglers", x=3, y=11000,
         fontsize=50, va='bottom', ha='center', color='#189AB4')

make_img(img_file, 0.25, 3, 9500)
gc.collect()  # For Memory Optimization
plt.axis('off')
plt.show()
대부분의 캐글 이용자들은 학사 이상의 학위를 가지고 있습니다.
(Master's degree : 석사, Bachelor's degree : 학사, Doctoral degree : 박사 학위)
df['Q5'].value_counts()


def make_img(img, zoom, x, y):
    """Draw an image on the module-level ``ax`` at data coordinates (x, y).

    Args:
        img: Path, URL, or binary file-like object of the image to draw.
        zoom: Scale factor handed to ``OffsetImage``.
        x: X position of the image anchor, in data coordinates.
        y: Y position of the image anchor, in data coordinates.
    """
    import io
    import urllib.parse
    import urllib.request

    # Passing a URL string straight to ``imread`` was deprecated in
    # Matplotlib 3.4 and is rejected by newer releases, so remote images are
    # downloaded first and decoded from memory. PNG is assumed for URLs,
    # which is what ``imread`` itself used to assume. The ``> 1`` check keeps
    # Windows drive letters ("C:\\...") from being mistaken for a URL scheme.
    if isinstance(img, str) and len(urllib.parse.urlparse(img).scheme) > 1:
        with urllib.request.urlopen(img) as response:
            pixels = mpimg.imread(io.BytesIO(response.read()), format="png")
    else:
        pixels = mpimg.imread(img)
    imagebox = OffsetImage(pixels, zoom=zoom)
    ab = AnnotationBbox(imagebox, (x, y), frameon=False)
    ax.add_artist(ab)
# Crown icon that is placed on the tallest bar
img_file = "https://www.freeiconspng.com/thumbs/crown-icon/queen-crown-icon-4.png"
zoom = 1
img_y = 4.8

fig, ax = plt.subplots(figsize=(25,10), facecolor="w")

# Job roles, hand-ordered so that the largest group sits in the middle.
# The variable names are carried over from the age chart; they hold role
# labels and counts here. Line breaks keep the long labels narrow.
age_bucket = [
    'Developer\n Relations\n/Advocacy', 'Statistician', 'Data\n Engineer',
    'Business\n Analyst', 'Research\n Scientist', 'Data\n Analyst',
    'Software\n Engineer', 'Student', 'Data\n Scientist', 'Other',
    'Unemployed', 'ML\n Engineer', 'Project\n Manager', 'Product\n Manager',
    'DB\n Engineer',
]
age_bucket_cnt = [99,313,668,968,1538,2301,2449,6804,3414,2393,1986,1499,849,319,171]

# Per-bar styling: alternating colours, wider bars around the centre
color = ['#E6E6E6' if pos % 2 == 0 else '#189AB4' for pos in range(15)]
width = [0.8] * 6 + [0.9] * 3 + [0.8] * 6
alpha = [0.3, 0.45, 0.3, 0.45, 0.5, 0.6, 0.75, 1.0, 0.75, 0.6, 0.5, 0.45, 0.3, 0.3, 0.45]
fontsize = [12, 12, 14, 14, 14, 14, 18, 20, 16, 14, 12, 14, 14, 12, 12]
x_num = list(range(15))

for i, role in enumerate(age_bucket):
    plt.bar(x=role, height=age_bucket_cnt[i], width=width[i],
            color=color[i], alpha=alpha[i])
    # Label each bar with its role name just above the bar top
    plt.text(s=role, x=x_num[i], y=age_bucket_cnt[i], va='bottom',
             ha='center', fontsize=fontsize[i], alpha=alpha[i])

# Chart title, drawn as text because the axes are hidden below
plt.text(s="Current Role of all Kagglers", x=7.5, y=7500, fontsize=50,
         va='bottom', ha='center', color='#189AB4')

make_img(img_file, 0.15, 7, 6500)
gc.collect()  # For Memory Optimization
plt.axis('off')
plt.show()
작성자의 예측과 다르게 학생이 압도적으로 높은 수치가 나왔습니다.
(대부분이 ML 전문가나 데이터 분석가가 나올것이라고 생각한 것 같아요.)
여기서 주목할 점이 Others 입니다. 꽤 상위권에 위치하는데요.
타 분야 사람이 캐글 이용에 적극적인 것으로 생각할 수 있는데요. 데이터 분석이 많은 분야에서 응용될 수 있다는 것을 보여주는 것 같아요.
df['Q6'].value_counts()

# Experience bins with their hard-coded counts, largest first
years_bin = ['1-3years','<1years','3-5years','5-10years','10-20years','20+years','Never Coded']
years_cnt = [7874, 5881, 4061, 3099, 2166, 1860, 1032]

fig = plt.figure(figsize=(20,10))
# The three largest bins are highlighted in blue, the rest are grey
plt.barh(
    y=years_bin,
    width=years_cnt,
    height=0.7,
    color=['#189AB4'] * 3 + ['#E6E6E6'] * 4,
    alpha=0.8,
)

# Bar labels: the text is right-aligned at hand-picked x positions so that it
# ends to the right of each bar
s1 = list(years_bin)
x1 = [8874, 6881, 5061, 4099, 3366, 2860, 2432]
y1 = list(range(7))
for i, label in enumerate(s1):
    plt.text(s=label, x=x1[i], y=y1[i], fontsize=25, va='center',
             ha='right', alpha=0.8)

plt.title("Average Years of Programming Experience of Kagglers",
          fontsize=42, pad=20, color='#189AB4')
plt.axis('off')
# Flip the y axis so the first (largest) bin is drawn at the top
plt.gca().invert_yaxis()
plt.show()
캐글 내에 생각보다 코딩 경력이 오래된 사람이 많지 않습니다.
젊은 플랫폼이라고도 생각할 수 있고, 초보자가 접근하기 어렵지 않다고도 생각할 수 있겠네요.
df['Q7_Part_1'].value_counts()
df['Q7_Part_2'].value_counts()

# Only two of the Q7 answer columns are compared here; sizes are hard-coded
Tool = ['Python', 'R']
Tool_cnt = [21860, 5334]
colors = ['#E6E6E6', '#189AB4']
# Pull both wedges slightly away from the centre
explode = (0.05, 0.05)

plt.figure(figsize=[20,10])
plt.pie(
    Tool_cnt,
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
    pctdistance=1.2,
)
plt.legend(Tool, title="Programming Languages", loc="upper right", prop={'size': 15})

# A white disc over the centre turns the pie into a donut
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.rcParams.update({'font.size': 25})

# Title of the chart
plt.text(s="Which Programming Tool do they Prefer?", x=0, y=1.3,
         fontsize=50, va='bottom', ha='center', color='#189AB4')
gc.collect()
plt.show()
파이썬과 R 이외에 다른 선택지도 있었고, 중복 선택이 허용된 문항이지만 작성자는 파이썬과 R만을 비교했습니다.
파이썬이 80% 이상으로 압도적인 사용률을 보였는데요.
앞서 조사한 결과에서 학생인 사람이 많고, 타 분야 전문가도 많기 때문에 쉬운 언어인 파이썬의 사용률이 높지 않을까 생각했어요.
# Collapse every answer other than Python / R / SQL (missing answers
# included) into a single 'Others' category
df['Q8'] = df['Q8'].where(df['Q8'].isin(['Python', 'R', 'SQL']), 'Others')
df['Q8'].value_counts()

# Wedge labels and their hard-coded sizes
Tool = ['Python', 'R', 'SQL', 'Others']
Tool_cnt = [20213, 1445, 1338, 2977]
colors = ['#E6E6E6', '#189AB4', '#FFFF00', '#ADFF2F']
# Pull every wedge slightly away from the centre
explode = (0.05, 0.05, 0.05, 0.05)

plt.figure(figsize=[20,10])
plt.pie(
    Tool_cnt,
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
    pctdistance=1.2,
)
plt.legend(Tool, title="Programming Languages", loc="upper right", prop={'size': 15})

# A white disc over the centre turns the pie into a donut
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.rcParams.update({'font.size': 25})

# Title of the chart
plt.text(s="What do they Recommend for Data Science?", x=0, y=1.3,
         fontsize=50, va='bottom', ha='center', color='#189AB4')
gc.collect()
plt.show()
앞선 조사와 비슷한데, 차이점은 중복선택이 안된다는 점입니다.
선택지가 꽤 많았는데도 파이썬이 압도적인 선택률을 보이네요.
# Tally the distinct answers in the first two Q9 answer columns
df['Q9_Part_1'].value_counts()
df['Q9_Part_2'].value_counts()
이런식으로 값을 추출해서 적용한 것 같아요.
# IDE names and their hard-coded counts, in matching order
name = ['JupyterLab','RStudio','Visual Studio','VS Code','PyCharm','Spyder','Notepad++','Sublime Text','Vim/Emacs','MATLAB','Jupyter Notebook','None','Other']
value = [5488,4771,4110,10040,7468,3794,3937,2839,1646,2203,16233,526,1491]
# Table of IDE and count, most popular first
df_nine_ = pd.DataFrame(name, columns=['IDE'])
df_nine_['Values'] = value
df_nine_ = df_nine_.sort_values(by="Values", ascending=False)
df_nine_
fig = plt.figure(figsize=(20,10))
# Take the columns with .tolist() rather than .unique(): unique() would drop
# a repeated count, leaving fewer bar widths than labels and shifting every
# bar after the duplicate onto the wrong IDE.
ide_names = df_nine_['IDE'].tolist()
ide_counts = df_nine_['Values'].tolist()
# The three most used IDEs are highlighted in blue, the rest are grey
plt.barh(width=ide_counts, y=ide_names, height=0.7,
         color=['#189AB4'] * 3 + ['#E6E6E6'] * 10, alpha=0.8)
# Bar labels: the text is right-aligned at hand-picked x positions so that it
# ends to the right of each bar
s1 = ide_names
x1 = [19833,12040,9468,7788,6471,6810,6437,5294,5539,4003,3946,2691,1726]
y1 = [0,1,2,3,4,5,6,7,8,9,10,11,12]
# zip() stops at the shortest list, so the loop can never index past the
# hand-written positions even if the IDE list changes length
for label, x_pos, y_pos in zip(s1, x1, y1):
    plt.text(s=label, x=x_pos, y=y_pos, fontsize=25, va='center', ha='right', alpha=0.8)
plt.title("Preferred IDE of Kagglers", fontsize=42, pad=20, color='#189AB4')
plt.axis('off')
# Flip the y axis so the most popular IDE is drawn at the top
plt.gca().invert_yaxis()
gc.collect()
plt.show()
주피터 노트북이 사용자 친화적이라고 코멘트를 합니다. 시프트+엔터시 결과물이 바로 나와 편리하다는 근거와 함께.
VS CODE는 다른 언어(C) 할때 저도 사용했는데, 깃허브와 연동이 좋아서 사용이 편리합니다. 역시 많은 사용자가 이용하는 것 같아요.
파이참도 저는 써보진 않았지만 높은 순위를 기록합니다.
R을 사용하는 사람 비율 대비 R스튜디오도 많이 쓰는 모습을 보이는데, 대부분의 R 사용자가 R스튜디오를 사용한다고 생각됩니다.
# Tally the distinct answers in the first two Q10 answer columns
df['Q10_Part_1'].value_counts()
df['Q10_Part_2'].value_counts()
코랩 노트북, 캐글 노트북 이용자 이외는 Other로 생각한 것 같습니다.
def make_img(img, zoom, x, y):
    """Draw an image on the module-level ``ax`` at data coordinates (x, y).

    Args:
        img: Path, URL, or binary file-like object of the image to draw.
        zoom: Scale factor handed to ``OffsetImage``.
        x: X position of the image anchor, in data coordinates.
        y: Y position of the image anchor, in data coordinates.
    """
    import io
    import urllib.parse
    import urllib.request

    # Passing a URL string straight to ``imread`` was deprecated in
    # Matplotlib 3.4 and is rejected by newer releases, so remote images are
    # downloaded first and decoded from memory. PNG is assumed for URLs,
    # which is what ``imread`` itself used to assume. The ``> 1`` check keeps
    # Windows drive letters ("C:\\...") from being mistaken for a URL scheme.
    if isinstance(img, str) and len(urllib.parse.urlparse(img).scheme) > 1:
        with urllib.request.urlopen(img) as response:
            pixels = mpimg.imread(io.BytesIO(response.read()), format="png")
    else:
        pixels = mpimg.imread(img)
    imagebox = OffsetImage(pixels, zoom=zoom)
    ab = AnnotationBbox(imagebox, (x, y), frameon=False)
    ax.add_artist(ab)
# Crown icon that is placed on the tallest bar
img_file = "https://www.freeiconspng.com/thumbs/crown-icon/queen-crown-icon-4.png"
zoom = 1
img_y = 4.8

# Hosted-notebook usage chart
fig, ax = plt.subplots(figsize=(25,10), facecolor="w")

# The variable names are carried over from the age chart; they hold
# notebook products and their hard-coded counts here.
age_bucket = ['None','Colab Notebook','Kaggle Notebook']
age_bucket_cnt = [7174,9792,9507]

# Per-bar styling: the middle bar is highlighted
color = ['#E6E6E6','#189AB4','#E6E6E6']
width = [0.9] * 3
alpha = [0.55, 1.0, 0.75]
fontsize = [25, 45, 30]
x_num = list(range(3))

for i, product in enumerate(age_bucket):
    plt.bar(x=product, height=age_bucket_cnt[i], width=width[i],
            color=color[i], alpha=alpha[i])
    # Label each bar with its product name just above the bar top
    plt.text(s=product, x=x_num[i], y=age_bucket_cnt[i], va='bottom',
             ha='center', fontsize=fontsize[i], alpha=alpha[i])

# Chart title, drawn as text because the axes are hidden below
plt.text(s="Preferred Hosted Notebooks", x=1, y=11000, fontsize=50,
         va='bottom', ha='center', color='#189AB4')

make_img(img_file, 0.3, 1, 9000)
gc.collect()  # For Memory Optimization
plt.axis('off')
plt.show()
코랩 노트북과 캐글 노트북의 사용자 수가 비슷합니다.
코랩 노트북은 점유율 1위로, GPU 사용이 일부 가능하고 구글 드라이브와 연동이 잘된다는 점을 큰 장점으로 소개합니다.
물론 캐글 이용자 조사이기 때문에 캐글 데이터와 캐글 노트북 간 호환성, 접근성이 좋아서 캐글 노트북 사용자가 다소 많이 집계됐습니다.
다만 캐글 노트북만의 분명한 장점이 있겠죠? 어떤 환경인지 기회가 될 때 한번 탐색해 보는 것도 좋을 것 같아요.
또 특이한 점은 두 노트북 이외 각자의 PC환경을 사용하는 사람도 꽤 많다는 것입니다.
# Tally the distinct answers in every Q12 answer column
df['Q12_Part_1'].value_counts()
df['Q12_Part_2'].value_counts()
df['Q12_Part_3'].value_counts()
df['Q12_Part_4'].value_counts()
df['Q12_Part_5'].value_counts()
df['Q12_OTHER'].value_counts()

# Hardware options with their hard-coded counts, largest first
name = ["None","NVIDIA GPUs","Google Cloud TPUs","Other","AWS Inferentia Chips","AWS Trainium Chips"]
count = [13234,8036,3451,867,416,414]

fig = plt.figure(figsize=(20,10))
# The GPU and TPU bars are highlighted in blue, the rest are grey
plt.barh(
    y=name,
    width=count,
    height=0.7,
    color=['#E6E6E6', '#189AB4', '#189AB4'] + ['#E6E6E6'] * 3,
    alpha=0.8,
)

# Bar labels: the text is right-aligned at hand-picked x positions so that it
# ends to the right of each bar
s1 = name
x1 = [14234,10236,6651,2067,3916,3714]
y1 = list(range(6))
for i, label in enumerate(s1):
    plt.text(s=label, x=x1[i], y=y1[i], fontsize=25, va='center',
             ha='right', alpha=0.8)

plt.title("Specialized Hardware", fontsize=42, pad=20, color='#189AB4')
plt.axis('off')
# Flip the y axis so the first (largest) entry is drawn at the top
plt.gca().invert_yaxis()
gc.collect()
plt.show()
GPU나 TPU를 사용하지 않는 캐글 사용자가 상당히 많이 있네요.
대회참가를 위한 데이터 공부가 아니라 설문조사를 시각화 하는 공부였습니다.
이쁘게 시각화 하기 위해서 작성자가 다양하게 노력한 모습을 확인했습니다.
또한 설문조사가 캐글 이용자 관련 설문조사라서 결과에 대해 더 흥미롭게 확인 한 것 같아요.
가볍게 공부하기 좋은 데이터 셋인것 같습니다.
대회 출처 : https://www.kaggle.com/c/kaggle-survey-2021
코드 출처 : https://www.kaggle.com/vivek468/what-s-up-kaggle-kaggle-survey-2021