其他
盘一盘 2021 年程序员们喜欢的网站数据
The following article is from 萝卜大杂烩 Author 周萝卜
作者 | 周萝卜
来源 | 萝卜大杂烩
数据获取
https://www.visualcapitalist.com/the-50-most-visited-websites-in-the-world/
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the ranking page and collect every table row as a list of cell texts.
# A desktop-browser User-Agent is sent with the request (presumably the site
# rejects the default client string — confirm).
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
res = requests.get(
    "https://www.visualcapitalist.com/the-50-most-visited-websites-in-the-world/",
    headers=headers,
    timeout=30,  # do not hang forever on an unresponsive server
)
res.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
soup = BeautifulSoup(res.text, "html.parser")
tbody = soup.find("table").find("tbody")
tr_list = tbody.find_all("tr")

# One inner list per <tr>, holding the text of each <td> in document order.
data_list = [[td.text for td in tr.find_all("td")] for tr in tr_list]
print(data_list)
[['1', 'Google.com', '92.5B', 'U.S.', 'Search Engines'],
['2', 'Youtube.com', '34.6B', 'U.S.', 'TV Movies and Streaming'],
['3',
'Facebook.com',
'25.5B',
'U.S.',
'Social Networks and Online Communities'],
['4',
'Twitter.com',
'6.6B',
'U.S.',
'Social Networks and Online Communities'],
['5', 'Wikipedia.org', '6.1B', 'U.S.', 'Dictionaries and Encyclopedias'],
['6',
'Instagram.com',
'6.1B',
'U.S.',
'Social Networks and Online Communities'],
....
# Load the scraped rows into a DataFrame; the columns are positional (0..4)
# until they are renamed on the next line.
df = pd.DataFrame(data_list)
df.rename(columns={0: 'Rank', 1: 'WebSite', 2: 'Traffic', 3: 'Country', 4: 'Type'}, inplace=True)

# Normalise traffic strings to a float expressed in billions:
# "92.5B" -> 92.5, and values with an "M" suffix are divided by 1000.
# Both branches return float so the column has one consistent numeric type.
df['new_Traffic'] = df['Traffic'].apply(
    lambda x: float(x.split("B")[0]) if "B" in x else float(x.split("M")[0]) / 1000
)
print(df)
Rank WebSite Traffic Country Type new_Traffic
0 1 Google.com 92.5B U.S. Search Engines 92.5
1 2 Youtube.com 34.6B U.S. TV Movies and Streaming 34.6
2 3 Facebook.com 25.5B U.S. Social Networks and Online Communities 25.5
3 4 Twitter.com 6.6B U.S. Social Networks and Online Communities 6.6
4 5 Wikipedia.org 6.1B U.S. Dictionaries and Encyclopedias 6.1
5 6 Instagram.com 6.1B U.S. Social Networks and Online Communities 6.1
6 7 Baidu.com 5.6B China Search Engines 5.6
7 8 Yahoo.com 3.8B U.S. News and Media 3.8
8 9 xvideos.com 3.4B Czech Republic Adult 3.4
9 10 pornhub.com 3.3B Canada Adult 3.3
10 11 Yandex.ru 3.2B Russia Search Engines 3.2
11 12 Whatsapp.com 3.1B U.S. Social Networks and Online Communities 3.1
12 13 Amazon.com 2.9B U.S. Marketplace 2.9
...
# Repeat every row of df 24 times, add a 'date' column, and export the
# name/type/value/date columns to newdf.csv.
# NOTE(review): web_name and d_list are not defined in the visible code —
# presumably a list of site names and a list of dates; confirm before running.
# NOTE(review): the loop nesting below is reconstructed; the original
# indentation was lost.
newdf = pd.DataFrame(np.repeat(df.values, 24, axis=0))
newdf.columns = df.columns  # the frame is rebuilt from raw values, so restore the column names
newdf['date'] = ''
for i, r in newdf.iterrows():
    print(r['WebSite'])
    ni = 0
    # tag is the position of the site in the reversed name list.
    for tag, j in enumerate(web_name[::-1]):
        if r['WebSite'] == j:
            print(tag)
            print(ni)
            # NOTE(review): pandas documents that writing to the row yielded by
            # iterrows() may modify a copy and leave newdf unchanged. If the
            # intent is to fill newdf['date'], this likely needs an explicit
            # write-back (e.g. newdf.at[i, 'date'] = ...) — confirm intent.
            r['date'] = d_list[tag:]
            ni += 1
newdf = newdf[['WebSite', 'Type', 'new_Traffic', 'date']]
newnew = newdf.rename(columns={'WebSite': 'name', 'Type': 'type', 'new_Traffic': 'value'})
newnew.to_csv('newdf.csv', index=False)  # omit the row index from the file
可视化分析
# pyecharts pieces used by the charts below. Bar, Polar and Scatter are
# instantiated later in this file, so they must be imported alongside
# Grid and Liquid.
from pyecharts import options as opts
from pyecharts.charts import Bar, Grid, Liquid, Polar, Scatter
from pyecharts.commons.utils import JsCode
from pyecharts.globals import SymbolType, ThemeType
排名前十榜单
# Horizontal bar chart of the first ten rows of df: site name vs. traffic
# (new_Traffic, in billions).
# The category labels must be sliced the same way as the values so that each
# bar is paired with its own site name.
x_data = df['WebSite'].values.tolist()[:10]
y_data = df['new_Traffic'].values.tolist()[:10]
b = (
    Bar()
    .add_xaxis(x_data)
    .add_yaxis('', y_data)
    .set_global_opts(
        title_opts=opts.TitleOpts(),
        yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=30)),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='right'))
    .reversal_axis()  # flip the axes so the bars run horizontally
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
排名前二十榜单
# Horizontal bar chart of rows 10..19 of df (ranks 11-20): site name vs.
# traffic (new_Traffic, in billions).
# x_data is re-sliced here; otherwise the labels from a previous chart would be
# paired with these values.
x_data = df['WebSite'].values.tolist()[10:20]
y_data = df['new_Traffic'].values.tolist()[10:20]
b = (
    Bar()
    .add_xaxis(x_data)
    .add_yaxis('', y_data)
    .set_global_opts(
        title_opts=opts.TitleOpts(),
        yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=30)),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='right'))
    .reversal_axis()  # flip the axes so the bars run horizontally
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
国家排名
# Vertical bar chart of the first seven entries of country_group: the index
# supplies the category labels and the "Rank" column supplies the bar heights.
# NOTE(review): country_group is not defined in the visible code — presumably
# df grouped by Country and counted; confirm.
x_data = country_group.index[:7].tolist()
y_data = country_group["Rank"].tolist()[:7]

b = Bar()
b.add_xaxis(x_data)
b.add_yaxis('', y_data)
b.set_global_opts(
    title_opts=opts.TitleOpts(),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
)
b.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='top'))

grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
# Scatter chart of whatever x_data / y_data currently hold. The chart must be
# bound to `c` because it is handed to the Grid below.
c = (
    Scatter()
    .add_xaxis(x_data)
    .add_yaxis("", y_data)
    .set_global_opts(
        title_opts=opts.TitleOpts(),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
        # Visual map of type "size" over the value range 1..30.
        visualmap_opts=opts.VisualMapOpts(type_="size", max_=30, min_=1),
    )
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(c, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
网站类型
# Polar bar chart of the first nine entries of type_group: the index supplies
# the angle-axis categories and the "Rank" column supplies the bar lengths.
# NOTE(review): type_group is not defined in the visible code — presumably df
# grouped by Type and counted; confirm.
x_type = type_group.index.tolist()
y_type = type_group["Rank"].values.tolist()

# Hand-written list of type names; not referenced by the visible code.
test = [
    'Social Networks and Online Communities',
    'Marketplace',
    'News and Media',
    'Search Engines',
    'Adult',
    'Programming and Developer Software',
    'Email',
]

c = Polar()
c.add_schema(angleaxis_opts=opts.AngleAxisOpts(data=x_type[:9], type_="category"))
c.add("", y_type[:9], type_="bar", stack="stack0")
c.set_global_opts(title_opts=opts.TitleOpts(title=""))

grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(c, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
各类型网站所占比例
# Four liquid-fill gauges, each showing one entry of y_type as a fraction of
# the total, laid out in a 2x2 arrangement inside one Grid.
# NOTE(review): which site type each y_type index refers to depends on the
# order of type_group, which is not visible here.
type_total = sum(y_type)  # shared denominator for all four gauges

# JavaScript label formatter: renders the value as a percentage, truncated to
# two decimals.
percent_formatter = JsCode(
"""function (param) {
return (Math.floor(param.value * 10000) / 100) + '%';
}"""
)

# Bottom-left gauge. It must be bound to `l1` because the Grid below adds it.
l1 = (
    Liquid()
    .add("", [y_type[2] / type_total], center=["35%", "75%"])
    .set_global_opts(title_opts=opts.TitleOpts(title=""))
)
# Top-left gauge, with a large percentage label drawn inside.
l2 = Liquid().add(
    "lq",
    [y_type[0] / type_total],
    center=["25%", "26%"],
    label_opts=opts.LabelOpts(
        font_size=50,
        formatter=percent_formatter,
        position="inside",
    ),
)
# Top-right gauge.
l3 = (
    Liquid()
    .add("", [y_type[1] / type_total], center=["75%", "26%"])
    .set_global_opts(title_opts=opts.TitleOpts(title=""))
)
# Bottom-right gauge, with a large percentage label drawn inside.
l4 = Liquid().add(
    "",
    [y_type[3] / type_total],
    center=["65%", "75%"],
    label_opts=opts.LabelOpts(
        font_size=50,
        formatter=percent_formatter,
        position="inside",
        is_show=True,
    ),
)
grid = (
    Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
    .add(l1, grid_opts=opts.GridOpts())
    .add(l2, grid_opts=opts.GridOpts())
    .add(l3, grid_opts=opts.GridOpts())
    .add(l4, grid_opts=opts.GridOpts())
)
grid.render_notebook()
动态排行展示
技术资讯
分享
点收藏
点点赞
点在看