# 毕设帮助,选题指导,技术解答,欢迎打扰,见B站个人主页
# https://space.bilibili.com/33886978
# Third-party names used throughout this script. No import statements are
# visible anywhere else in the file, so they are declared here.
import numpy as np
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar, Gauge, HeatMap, Line, Pie
from sklearn.cluster import KMeans

# ---------------------------------------------------------------------------
# Load the raw data and normalise column dtypes.
# ---------------------------------------------------------------------------
df = pd.read_csv('data.csv')
df.head()  # notebook preview of the first rows
df.info()

# Report how many rows are exact duplicates (the frame itself is not changed).
print('去重前:',df.shape[0],'行数据')
print('去重后:',df.drop_duplicates().shape[0],'行数据')

# Per-column count of missing values.  `df.isnull().sum()` is explicit about
# reducing column-wise, unlike `np.sum(DataFrame)`.
print(df.isnull().sum())

# Timestamps.
for _col in ('date', 'real_time'):
    df[_col] = pd.to_datetime(df[_col])

# Identifier-like columns are treated as categorical labels, not numbers.
for _col in ('uid', 'user_city', 'item_id', 'author_id', 'item_city', 'music_id'):
    df[_col] = df[_col].astype('str')

# These three columns get a trailing ".0" removed (presumably they are parsed
# as floats, e.g. "109.0" -> "109" -- TODO confirm against data.csv).  Only a
# literal ".0" suffix is stripped, so other values such as "nan" are left
# intact instead of losing their last two characters.
for _col in ('user_city', 'item_city', 'music_id'):
    df[_col] = df[_col].str.replace(r'\.0$', '', regex=True)

df.info()
# ---------------------------------------------------------------------------
# Bar chart: number of users per user city.
# ---------------------------------------------------------------------------
# One row per distinct (uid, user_city) pair, mirroring how `author_info` is
# built for creators further down in this script.
user_info = df.drop_duplicates(['uid', 'user_city'])[['uid', 'user_city']]

# Users per city, largest first.
user_city_count = (
    user_info.groupby('user_city').count().sort_values(by='uid', ascending=False)
)
x1 = user_city_count.index.tolist()
y1 = user_city_count['uid'].tolist()
len(y1)  # number of distinct cities (notebook display)

chart = Bar()
chart.add_xaxis(x1)
chart.add_yaxis(
    '地区使用人数',
    y1,
    color='#F6325A',
    itemstyle_opts={'barBorderRadius': [60, 60, 20, 20]},
    label_opts=opts.LabelOpts(position='top'),
)
chart.set_global_opts(
    # Initially show only the first 5% of the (sorted) cities; the slider
    # lets the reader scroll through the rest.
    datazoom_opts=opts.DataZoomOpts(
        range_start=0,
        range_end=5,
        orient='horizontal',
        type_='slider',
        is_zoom_lock=False,
        pos_left='1%',
    ),
    visualmap_opts=opts.VisualMapOpts(
        is_show=False, type_='opacity', range_opacity=[0.2, 1]
    ),
    title_opts=opts.TitleOpts(title="不同地区用户数量分布图", pos_left='40%'),
    legend_opts=opts.LegendOpts(pos_right='10%', pos_top='2%'),
)
chart.render_notebook()
# ---------------------------------------------------------------------------
# Line chart: number of views per value of column 'H'.
# ---------------------------------------------------------------------------
# NOTE(review): column 'H' is not created anywhere in this script; it is
# presumably an hour-of-day column shipped in data.csv -- confirm.
# The grouping is computed once and reused for both axes.
hourly_views = df.groupby('H')['uid'].count()
h = hourly_views.index.tolist()
h_num = (hourly_views / 10000).round(1).tolist()  # unit: 10k views

chart = Line()
chart.add_xaxis(h)
chart.add_yaxis(
    '观看数/(万)',
    h_num,
    areastyle_opts=opts.AreaStyleOpts(color='#1AF5EF', opacity=0.3),
    itemstyle_opts=opts.ItemStyleOpts(color='black'),
    label_opts=opts.LabelOpts(font_size=12),
)
chart.set_global_opts(
    legend_opts=opts.LegendOpts(pos_right='10%', pos_top='2%'),
    # Title fixed: the original read "不时间…", missing the character "同"
    # that every sibling chart title ("不同…分布图") has.
    title_opts=opts.TitleOpts(title="不同时间观看数量分布图", pos_left='40%'),
)
chart.render_notebook()
# ---------------------------------------------------------------------------
# Dual-axis line chart: finish rate and like rate per value of column 'H'.
# ---------------------------------------------------------------------------
# Aggregate only the columns that are needed.  Summing the whole frame
# (`groupby().sum()`) would also try to sum the datetime and string columns.
per = df.groupby('H').agg({'finish': 'sum', 'like': 'sum', 'uid': 'count'})
per['finish_radio'] = (per['finish'] * 100 / per['uid']).round(2)
per['like_radio'] = (per['like'] * 100 / per['uid']).round(2)

x = per.index.tolist()
y1 = per['finish_radio'].tolist()
y2 = per['like_radio'].tolist()

# Base chart: finish rate on the primary (left) y-axis.
chart1 = Line()
chart1.add_xaxis(x)
chart1.add_yaxis(
    '完播率/%',
    y1,
    is_smooth=True,
    label_opts=opts.LabelOpts(is_show=False),
    is_symbol_show=False,
    linestyle_opts=opts.LineStyleOpts(
        color='#F6325A', opacity=.7, curve=0, width=2, type_='solid'
    ),
)
# Secondary (right) y-axis for the like rate, which lives on a much smaller
# scale than the finish rate.
chart1.extend_axis(yaxis=opts.AxisOpts(min_=0.4, max_=3))

# Overlay chart: like rate bound to the secondary axis (index 1).
chart2 = Line()
chart2.add_xaxis(x)
chart2.add_yaxis(
    '点赞率/%',
    y2,
    yaxis_index=1,
    is_smooth=True,
    label_opts=opts.LabelOpts(is_show=False),
    is_symbol_show=False,
    linestyle_opts=opts.LineStyleOpts(
        color='#1AF5EF', opacity=.7, curve=0, width=2, type_='solid'
    ),
)
chart1.overlap(chart2)

# All global options in a single call (primary-axis range, legend, title).
chart1.set_global_opts(
    yaxis_opts=opts.AxisOpts(min_=25, max_=45),
    legend_opts=opts.LegendOpts(pos_right='10%', pos_top='2%'),
    title_opts=opts.TitleOpts(title="点赞/完播率分布图", pos_left='40%'),
)
chart1.render_notebook()
# ---------------------------------------------------------------------------
# Rose/pie chart: number of plays per day of the week.
# ---------------------------------------------------------------------------
df['weekday'] = df['date'].dt.weekday  # Monday == 0 ... Sunday == 6

# Reindex on 0..6 so each label is paired with its own weekday even when a
# weekday has no rows (plain positional indexing would shift or overflow).
week = (
    df.groupby('weekday')['uid'].count().reindex(range(7), fill_value=0).tolist()
)
weekday_labels = ['周一', '周二', '周三', '周四', '周五', '周六', '周日']
df_pair = [[label, count] for label, count in zip(weekday_labels, week)]

chart = Pie()
chart.add(
    '',
    df_pair,
    radius=['40%', '70%'],
    rosetype='radius',
    center=['45%', '50%'],
    label_opts=opts.LabelOpts(is_show=True, formatter='{b}:{c}次'),
)
chart.set_global_opts(
    visualmap_opts=[
        opts.VisualMapOpts(
            min_=200000,
            max_=300000,
            type_='color',
            range_color=['#1AF5EF', '#F6325A', '#000000'],
            is_show=True,
            pos_top='65%',
        )
    ],
    legend_opts=opts.LegendOpts(pos_right='10%', pos_top='2%', orient='vertical'),
    title_opts=opts.TitleOpts(title="一周内播放分布图", pos_left='35%'),
)
chart.render_notebook()
# Number of rows (non-null uid) per channel -- notebook display expression.
df.groupby('channel')['uid'].count()
# ---------------------------------------------------------------------------
# Bar chart: number of creators per item city.
# ---------------------------------------------------------------------------
# One row per distinct (author_id, item_city) pair.
author_info = df.drop_duplicates(['author_id', 'item_city'])[['author_id', 'item_city']]
author_info.info()

# Creators per city, largest first.
author_city_count = (
    author_info.groupby('item_city').count().sort_values(by='author_id', ascending=False)
)
x1 = author_city_count.index.tolist()
y1 = author_city_count['author_id'].tolist()

# Number of rows left after de-duplicating on author_id, i.e. the number of
# distinct creators (notebook display expression).
df.drop_duplicates(['author_id']).shape[0]

chart = Bar()
chart.add_xaxis(x1)
chart.add_yaxis(
    '地区创作者人数',
    y1,
    color='#F6325A',
    itemstyle_opts={'barBorderRadius': [60, 60, 20, 20]},
)
chart.set_global_opts(
    # Initially show only the first 5% of the (sorted) cities.
    datazoom_opts=opts.DataZoomOpts(
        range_start=0,
        range_end=5,
        orient='horizontal',
        type_='slider',
        is_zoom_lock=False,
        pos_left='1%',
    ),
    visualmap_opts=opts.VisualMapOpts(
        is_show=False, type_='opacity', range_opacity=[0.2, 1]
    ),
    legend_opts=opts.LegendOpts(pos_right='10%', pos_top='2%'),
    title_opts=opts.TitleOpts(title="不同城市创作者分布图", pos_left='40%'),
)
chart.render_notebook()
# ---------------------------------------------------------------------------
# Bar chart: number of distinct videos for each video duration.
# ---------------------------------------------------------------------------
# Keep one row per item_id so every video is counted once.
unique_items = df.drop_duplicates(['item_id'])[['item_id', 'duration_time']]
items_per_duration = unique_items.groupby('duration_time')['item_id'].count()

x1 = items_per_duration.index.tolist()
y1 = items_per_duration.tolist()

chart = Bar()
chart.add_xaxis(x1)
chart.add_yaxis(
    '视频时长对应视频数',
    y1,
    color='#1AF5EF',
    itemstyle_opts={'barBorderRadius': [60, 60, 20, 20]},
    label_opts=opts.LabelOpts(font_size=12, color='black'),
)

# Slider initially showing the first half of the durations.
duration_zoom = opts.DataZoomOpts(
    range_start=0, range_end=50, orient='horizontal', type_='slider'
)
# Hidden visual map that varies bar opacity with the count.
duration_opacity = opts.VisualMapOpts(
    max_=100000, min_=200, is_show=False, type_='opacity', range_opacity=[0.4, 1]
)
chart.set_global_opts(
    datazoom_opts=duration_zoom,
    visualmap_opts=duration_opacity,
    legend_opts=opts.LegendOpts(pos_right='10%', pos_top='2%'),
    title_opts=opts.TitleOpts(title="不同时长作品分布图", pos_left='40%'),
)
chart.render_notebook()
# ---------------------------------------------------------------------------
# Gauge: overall like rate and finish rate, in percent of all rows.
# ---------------------------------------------------------------------------
total_rows = len(df)
like_per = 100 * df['like'].sum() / total_rows
finish_per = 100 * df['finish'].sum() / total_rows

gauge = Gauge()
gauge.add(
    "",
    [("视频互动率", like_per), ("完播率", finish_per)],
    detail_label_opts=opts.LabelOpts(is_show=False, font_size=18),
    axisline_opts=opts.AxisLineOpts(
        linestyle_opts=opts.LineStyleOpts(
            # Colour bands of the dial: 0-30%, 30-70%, 70-100%.
            color=[(0.3, "#1AF5EF"), (0.7, "#F6325A"), (1, "#000000")],
            width=20,
        )
    ),
)
gauge.render_notebook()
# ---------------------------------------------------------------------------
# Heat map: Spearman rank correlation between a subset of columns.
# ---------------------------------------------------------------------------
df_cor = df[['finish', 'like', 'duration_time', 'H']]  # selected columns only
cor_table = df_cor.corr(method='spearman')
cor_array = np.array(cor_table)
cor_name = list(cor_table.columns)

# Heat-map cells as [x_index, y_index, value].  The x index runs from the
# last column down to the first, the y index from first to last; the ranges
# are derived from the number of columns rather than hard-coded.
n_cols = len(cor_name)
value = [
    [i, j, cor_array[i, j]]
    for i in reversed(range(n_cols))
    for j in range(n_cols)
]

heat = HeatMap()
heat.add_xaxis(cor_name)
heat.add_yaxis(
    "",
    cor_name,
    value,
    label_opts=opts.LabelOpts(is_show=True, position="inside"),
)
heat.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(
        is_show=False,
        max_=0.08,
        range_color=["#1AF5EF", "#F6325A", "#000000"],
    )
)
heat.render_notebook()
# ---------------------------------------------------------------------------
# Line chart: daily UV (distinct users) and PV (row count) per date.
# ---------------------------------------------------------------------------
puv = df.groupby(['date']).agg({'uid': 'nunique', 'item_id': 'count'})
uv = puv['uid'].to_list()
pv = puv['item_id'].to_list()
# Sorted list of dates present in the data.  The name `time` is kept because
# the retention computation later in this script reads it.
time = puv.index.to_list()

chart1 = Line()
chart1.add_xaxis(time)
chart1.add_yaxis(
    'uv',
    uv,
    is_smooth=True,
    label_opts=opts.LabelOpts(is_show=False),
    is_symbol_show=False,
    linestyle_opts=opts.LineStyleOpts(
        color='#1AF5EF', opacity=.7, curve=0, width=2, type_='solid'
    ),
)
chart1.add_yaxis(
    'pv',
    pv,
    is_smooth=True,
    label_opts=opts.LabelOpts(is_show=False),
    is_symbol_show=False,
    linestyle_opts=opts.LineStyleOpts(
        color='#F6325A', opacity=.7, curve=0, width=2, type_='solid'
    ),
)
chart1.render_notebook()
# ---------------------------------------------------------------------------
# User retention: share of a day's users seen again N observed days later.
# ---------------------------------------------------------------------------
def _retention_rates(days, users_by_day, gap):
    """Return retention percentages for each day that has a day `gap` steps later.

    For every position i with a partner at i + gap in `days`, the result holds
    100 * |users(i) & users(i + gap)| / |users(i)|, rounded to 2 decimals.
    """
    rates = []
    for base_day, later_day in zip(days, days[gap:]):
        base_users = users_by_day[base_day]
        retained = base_users & users_by_day[later_day]
        rates.append(round(100 * len(retained) / len(base_users), 2))
    return rates


# Build the set of uids for every date in a single pass over the frame,
# instead of re-filtering the whole frame twice per day.
_users_by_day = {day: set(uids) for day, uids in df.groupby('date')['uid']}
_days = sorted(_users_by_day)

# NOTE(review): the gap is positional (N entries later in the sorted list of
# observed dates).  It equals N calendar days only if no date is missing from
# the data -- confirm.
lc = _retention_rates(_days, _users_by_day, 7)   # 7-day retention, %
lc1 = _retention_rates(_days, _users_by_day, 1)  # next-day retention, % (not plotted)

x7 = _days[0:-7]  # days that have a partner 7 steps later
chart1 = Line()
chart1.add_xaxis(x7)
chart1.add_yaxis(
    '七日留存率/%',
    lc,
    is_smooth=True,
    label_opts=opts.LabelOpts(is_show=False),
    is_symbol_show=False,
    linestyle_opts=opts.LineStyleOpts(
        color='#F6325A', opacity=.7, curve=0, width=2, type_='solid'
    ),
)
chart1.set_global_opts(
    legend_opts=opts.LegendOpts(pos_right='10%', pos_top='2%'),
    title_opts=opts.TitleOpts(title="用户留存率分布图", pos_left='40%'),
)
chart1.render_notebook()
# ---------------------------------------------------------------------------
# Per-user features and K-Means elbow curve (inertia for k = 1..7).
# ---------------------------------------------------------------------------
# Per user: number of rows, number of likes, number of finished plays.
df1 = df.groupby(['uid']).agg({'item_id': 'count', 'like': 'sum', 'finish': 'sum'})
df1['like_per'] = df1['like'] / df1['item_id']
df1['finish_per'] = df1['finish'] / df1['item_id']

# Feature matrix: [row count, like ratio, finish ratio] per user.
# NOTE(review): the features are not scaled, so the row count (unbounded)
# will weigh far more in the Euclidean distance than the two ratios (0..1).
ndf1 = np.array(df1[['item_id', 'like_per', 'finish_per']])

# Fit one model per candidate k.  The original assignment of `kmeans_per_k`
# had been swallowed by a trailing comment on the previous line.  A fixed
# random_state (same value as the final model below in this script) makes the
# elbow curve reproducible.
k_values = list(range(1, 8))
kmeans_per_k = [KMeans(n_clusters=k, random_state=0).fit(ndf1) for k in k_values]
inertias = [model.inertia_ for model in kmeans_per_k]

chart = Line(init_opts=opts.InitOpts(width='560px', height='300px'))
chart.add_xaxis(k_values)  # a plain list rather than a range object
chart.add_yaxis(
    "",
    inertias,
    label_opts=opts.LabelOpts(is_show=False),
    linestyle_opts=opts.LineStyleOpts(
        color='#F6325A', opacity=.7, curve=0, width=3, type_='solid'
    ),
)
chart.render_notebook()
# ---------------------------------------------------------------------------
# Compare k = 4 and k = 3 by silhouette score.
# ---------------------------------------------------------------------------
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score

# k = 3 is evaluated last, so `n_cluster`, `cluster` and `y_pre` end up
# describing the 3-cluster model, exactly as in the original cell order.
silhouette_by_k = {}
for n_cluster in (4, 3):
    cluster = KMeans(n_clusters=n_cluster, random_state=0).fit(ndf1)
    y_pre = cluster.labels_  # cluster label assigned to each user
    silhouette_by_k[n_cluster] = silhouette_score(ndf1, y_pre)
    print(f'k={n_cluster}: silhouette={silhouette_by_k[n_cluster]:.4f}')
# Source article: http://lianchengexpo.xrbh.cn/quote/11415.html
# (site: 迅博思语资讯, http://lianchengexpo.xrbh.cn/)

# ---------------------------------------------------------------------------
# Grouped bar chart: profile of the three user clusters.
# ---------------------------------------------------------------------------
# One row per cluster: [play count, like rate (per mille), finish rate (%)],
# matching the x-axis labels below.
# NOTE(review): these numbers are hard-coded; presumably they were copied from
# the cluster centres of an earlier K-Means run -- confirm before reuse.
c_ = [
    [87.998, 9.1615, 39.92],
    [13.292, 12.077, 50.012],
    [275.011, 8.125, 28.751],
]

bar = Bar(init_opts=opts.InitOpts(theme='macarons', width='1000px', height='400px'))
# Categories on the x-axis.
bar.add_xaxis(['播放数','点赞率(千分之)','完播率(百分之)'])
# One series per cluster; each series uses its own stack name ('stack0',
# 'stack1', 'stack2'), so the bars are drawn side by side.
for idx, center in enumerate(c_):
    bar.add_yaxis(str(idx), [round(v, 2) for v in center], stack=f'stack{idx}')
bar.render_notebook()