Python爬取上海天气且可视化显示

一、安装Anaconda3环境和Pycharm

建议使用Anaconda3环境运行代码，因为Anaconda3中已经预装了所需的大多数包，不用重复安装；在Pycharm中可以方便地使用Anaconda3环境，也不会和电脑中已有的Python2环境混淆。

需要注意的是，Anaconda3建议从清华大学的镜像站下载，不要到官网下载：官网的下载速度太慢，还时不时断线。

二、Pycharm使用Anaconda3环境

Step1：打开Pycharm点击创建文件，选择文件存储路径并且创建。
File-Settings进行项目设置，接着点击 project interpreter 的右边的小齿轮，选择 add local ，选择anaconda文件路径下的python.exe。接着pycharm会更新解释器，导入模块等，要稍等一点时间。
如何验证有没有配置好呢？只要运行代码，看一下输出第一行的命令使用的是不是Anaconda3的Python，如果是的话，说明已经配置好了。

三、使用爬虫爬取天气信息

import requests # 用来抓取网页的HTML源代码
import csv # 将数据写入到csv文件中
import random # 取随机数
import time # 时间相关操作
import socket # 在这里只用于异常处理
import http.client # 这里只用于异常处理
from bs4 import BeautifulSoup # 可以方便得从HTML或XML中提取数据的Python库


# 获取网页中的HTML代码
def get_content(url, data=None, max_retries=5):
    """Download *url* and return the response body decoded as UTF-8 text.

    A failed request is retried after a randomised pause. Retrying used to
    be unbounded, so a permanent failure (bad URL, site down) kept the
    program looping forever; the number of attempts is now limited.

    Args:
        url: Address of the page to download.
        data: Unused; kept so that existing callers keep working.
        max_retries: Maximum number of attempts (at least 1). When the last
            attempt fails, its error is re-raised.

    Returns:
        The page source as a string.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        OSError, http.client.BadStatusLine, http.client.IncompleteRead:
            The error raised by the final attempt.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    # Request headers handed to requests.get so that the request resembles
    # one sent by a desktop browser.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'www.weather.com.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
    }
    # Timeout in seconds; randomised (per the original author) so the
    # requests look less like an automated crawler.
    timeout = random.choice(range(80, 180))

    for attempt in range(1, max_retries + 1):
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
        # Each handler records the code to print and how long to pause
        # before the next attempt.
        # NOTE(review): requests' own exceptions derive from IOError/OSError,
        # so they are presumably handled by the socket.error branch - confirm.
        except socket.timeout as e:
            code, pause, error = '3:', range(8, 15), e
        except socket.error as e:
            code, pause, error = '4:', range(20, 60), e
        except http.client.BadStatusLine as e:
            code, pause, error = '5:', range(30, 80), e
        except http.client.IncompleteRead as e:
            code, pause, error = '6:', range(5, 15), e
        else:
            # Force UTF-8 so the Chinese text in the page is not garbled.
            rep.encoding = 'utf-8'
            return rep.text

        print(code, error)
        if attempt == max_retries:
            # Out of attempts: surface the failure instead of looping forever.
            raise error
        time.sleep(random.choice(pause))

def get_data(html_text):
    """Extract the forecast rows from the HTML of the 15-day weather page.

    The function looks for ``<div id="15d">``, takes its first ``<ul>`` and
    turns every ``<li>`` inside it into one row.

    Args:
        html_text: HTML source of the forecast page.

    Returns:
        A list of rows. Each row is a list holding, in order: the date, the
        weather description, the text pieces of the temperature span
        (highest temperature first, with the degree sign and "/" removed),
        the wind direction and the wind force. An empty list is returned
        when the forecast container is not present in the page.
    """
    final = []
    bs = BeautifulSoup(html_text, "html.parser")
    # bs.body is None when the markup has no <body> tag (e.g. empty input);
    # fall back to searching the whole document instead of crashing.
    root = bs.body if bs.body is not None else bs
    data = root.find('div', {'id': '15d'})
    ul = data.find('ul') if data is not None else None
    if ul is None:
        # No forecast list in this page: nothing to extract.
        return final

    for day in ul.find_all('li'):
        temp = [
            day.find('span', class_="time").string,  # date
            day.find('span', class_="wea").string,  # weather description
        ]
        # The "tem" span holds two text nodes: the highest temperature
        # followed by the lowest one.
        inf = day.find('span', class_="tem")
        for position, infstr in enumerate(inf.strings):
            value = infstr.replace('℃', '')
            if position > 0:
                # Text after the first piece also carries a "/" separator.
                value = value.replace('/', '')
            temp.append(value)

        temp.append(day.find('span', class_="wind").string)  # wind direction
        temp.append(day.find('span', class_="wind1").string)  # wind force
        final.append(temp)
    return final

def write_data(data, name):
    """Write the rows in *data* to the CSV file *name*.

    Args:
        data: Iterable of rows; each row is an iterable of cell values.
        name: Path of the CSV file. An existing file is overwritten
            (mode 'w'); mode 'a' would append to the existing data instead.
    """
    # newline='' leaves line endings to the csv module; errors='ignore'
    # drops characters the file encoding cannot represent.
    with open(name, 'w', errors='ignore', newline='') as csv_file:
        csv.writer(csv_file).writerows(data)

if __name__ == '__main__':
    # Script entry point: download the forecast page, parse it and store
    # the extracted rows in weather2.csv.
    forecast_url = 'http://www.weather.com.cn/weather15d/101020100.shtml'
    forecast_rows = get_data(get_content(forecast_url))
    write_data(forecast_rows, 'weather2.csv')

import requests # 用来抓取网页的HTML源代码

import csv # 将数据写入到csv文件中

import random # 取随机数

import time # 时间相关操作

import socket # 在这里只用于异常处理

import http.client # 这里只用于异常处理

from bs4 import BeautifulSoup # 可以方便得从HTML或XML中提取数据的Python库

# 获取网页中的HTML代码

def get_content(url, data=None, max_retries=5):
    """Download *url* and return the response body decoded as UTF-8 text.

    A failed request is retried after a randomised pause. Retrying used to
    be unbounded, so a permanent failure (bad URL, site down) kept the
    program looping forever; the number of attempts is now limited.

    Args:
        url: Address of the page to download.
        data: Unused; kept so that existing callers keep working.
        max_retries: Maximum number of attempts (at least 1). When the last
            attempt fails, its error is re-raised.

    Returns:
        The page source as a string.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        OSError, http.client.BadStatusLine, http.client.IncompleteRead:
            The error raised by the final attempt.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    # Request headers handed to requests.get so that the request resembles
    # one sent by a desktop browser.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'www.weather.com.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
    }
    # Timeout in seconds; randomised (per the original author) so the
    # requests look less like an automated crawler.
    timeout = random.choice(range(80, 180))

    for attempt in range(1, max_retries + 1):
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
        # Each handler records the code to print and how long to pause
        # before the next attempt.
        # NOTE(review): requests' own exceptions derive from IOError/OSError,
        # so they are presumably handled by the socket.error branch - confirm.
        except socket.timeout as e:
            code, pause, error = '3:', range(8, 15), e
        except socket.error as e:
            code, pause, error = '4:', range(20, 60), e
        except http.client.BadStatusLine as e:
            code, pause, error = '5:', range(30, 80), e
        except http.client.IncompleteRead as e:
            code, pause, error = '6:', range(5, 15), e
        else:
            # Force UTF-8 so the Chinese text in the page is not garbled.
            rep.encoding = 'utf-8'
            return rep.text

        print(code, error)
        if attempt == max_retries:
            # Out of attempts: surface the failure instead of looping forever.
            raise error
        time.sleep(random.choice(pause))

def get_data(html_text):
    """Extract the forecast rows from the HTML of the 15-day weather page.

    The function looks for ``<div id="15d">``, takes its first ``<ul>`` and
    turns every ``<li>`` inside it into one row.

    Args:
        html_text: HTML source of the forecast page.

    Returns:
        A list of rows. Each row is a list holding, in order: the date, the
        weather description, the text pieces of the temperature span
        (highest temperature first, with the degree sign and "/" removed),
        the wind direction and the wind force. An empty list is returned
        when the forecast container is not present in the page.
    """
    final = []
    bs = BeautifulSoup(html_text, "html.parser")
    # bs.body is None when the markup has no <body> tag (e.g. empty input);
    # fall back to searching the whole document instead of crashing.
    root = bs.body if bs.body is not None else bs
    data = root.find('div', {'id': '15d'})
    ul = data.find('ul') if data is not None else None
    if ul is None:
        # No forecast list in this page: nothing to extract.
        return final

    for day in ul.find_all('li'):
        temp = [
            day.find('span', class_="time").string,  # date
            day.find('span', class_="wea").string,  # weather description
        ]
        # The "tem" span holds two text nodes: the highest temperature
        # followed by the lowest one.
        inf = day.find('span', class_="tem")
        for position, infstr in enumerate(inf.strings):
            value = infstr.replace('℃', '')
            if position > 0:
                # Text after the first piece also carries a "/" separator.
                value = value.replace('/', '')
            temp.append(value)

        temp.append(day.find('span', class_="wind").string)  # wind direction
        temp.append(day.find('span', class_="wind1").string)  # wind force
        final.append(temp)
    return final

def write_data(data, name):
    """Write the rows in *data* to the CSV file *name*.

    Args:
        data: Iterable of rows; each row is an iterable of cell values.
        name: Path of the CSV file. An existing file is overwritten
            (mode 'w'); mode 'a' would append to the existing data instead.
    """
    # newline='' leaves line endings to the csv module; errors='ignore'
    # drops characters the file encoding cannot represent.
    with open(name, 'w', errors='ignore', newline='') as csv_file:
        csv.writer(csv_file).writerows(data)

if __name__ == '__main__':
    # Script entry point: download the forecast page, parse it and store
    # the extracted rows in weather2.csv.
    url = 'http://www.weather.com.cn/weather15d/101020100.shtml'
    html = get_content(url)
    result = get_data(html)
    write_data(result, 'weather2.csv')

代码中都有详细的注释。其中的header可以这样获得：使用Chrome打开该网页，按F12打开开发者工具，选择Network查看网络请求，然后刷新页面，查看第一条请求的请求头（Request Headers）即可。整体的原理就是下载天气页面的HTML并进行解析，最后写入一个CSV文件。
关于如何用BeautifulSoup库来解析HTML网页，可以参考BeautifulSoup的官方文档。

四、数据可视化显示

#coding:utf-8
# Draw a line chart of the highest and lowest temperatures stored in
# weather2.csv, save it as line_plot.png and show it in a window.
import csv  # CSV parsing
import matplotlib.pyplot as plt  # plotting library

plt.style.use('ggplot')  # use the ggplot drawing style
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese font compatibility setting
plt.rcParams['axes.unicode_minus'] = False  # Chinese font compatibility setting

input_file = 'weather2.csv'  # CSV file to read

temphigh = []  # highest temperatures
templow = []  # lowest temperatures
dateinfo = []  # date labels

with open(input_file, 'r', newline='') as csv_in_file:  # open the CSV read-only
    filereader = csv.reader(csv_in_file)
    for row_list in filereader:
        if not row_list:
            # A blank line in the file carries no data.
            continue
        # Convert the temperatures to numbers: left as strings, matplotlib
        # would treat them as category labels instead of values on a
        # numeric axis.
        temphigh.append(float(row_list[2]))
        templow.append(float(row_list[3]))
        dateinfo.append(row_list[0])

fig = plt.figure()  # create the figure
ax1 = fig.add_subplot(1, 1, 1)  # 1 row x 1 column grid, first cell
ax1.plot(templow, marker=r'o', color=u'blue', linestyle='-', label=u'最低温度')  # lowest temperatures
ax1.plot(temphigh, marker=r'+', color=u'red', linestyle='--', label=u'最高温度')  # highest temperatures
ax1.xaxis.set_ticks_position('bottom')  # x-axis ticks at the bottom
ax1.yaxis.set_ticks_position('left')  # y-axis ticks on the left
ax1.set_title(u'上海市未来8-15日气温分布图')  # chart title
# Label the x axis with the dates, rotated; this call can be omitted when
# text labels are not wanted.
plt.xticks(range(len(dateinfo)), dateinfo, rotation=45, fontsize=10)
plt.xlabel(u'时间')  # x-axis title
plt.ylabel(u'温度')  # y-axis title
plt.legend(loc='best')

plt.savefig('line_plot.png', dpi=400, bbox_inches='tight')  # save the chart to disk
plt.show()  # open the chart window

#coding:utf-8
# Draw a line chart of the highest and lowest temperatures stored in
# weather2.csv, save it as line_plot.png and show it in a window.
import csv  # CSV parsing
import matplotlib.pyplot as plt  # plotting library

plt.style.use('ggplot')  # use the ggplot drawing style
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese font compatibility setting
plt.rcParams['axes.unicode_minus'] = False  # Chinese font compatibility setting

input_file = 'weather2.csv'  # CSV file to read

temphigh = []  # highest temperatures
templow = []  # lowest temperatures
dateinfo = []  # date labels

with open(input_file, 'r', newline='') as csv_in_file:  # open the CSV read-only
    filereader = csv.reader(csv_in_file)
    for row_list in filereader:
        if not row_list:
            # A blank line in the file carries no data.
            continue
        # Convert the temperatures to numbers: left as strings, matplotlib
        # would treat them as category labels instead of values on a
        # numeric axis.
        temphigh.append(float(row_list[2]))
        templow.append(float(row_list[3]))
        dateinfo.append(row_list[0])

fig = plt.figure()  # create the figure
ax1 = fig.add_subplot(1, 1, 1)  # 1 row x 1 column grid, first cell
ax1.plot(templow, marker=r'o', color=u'blue', linestyle='-', label=u'最低温度')  # lowest temperatures
ax1.plot(temphigh, marker=r'+', color=u'red', linestyle='--', label=u'最高温度')  # highest temperatures
ax1.xaxis.set_ticks_position('bottom')  # x-axis ticks at the bottom
ax1.yaxis.set_ticks_position('left')  # y-axis ticks on the left
ax1.set_title(u'上海市未来8-15日气温分布图')  # chart title
# Label the x axis with the dates, rotated; this call can be omitted when
# text labels are not wanted.
plt.xticks(range(len(dateinfo)), dateinfo, rotation=45, fontsize=10)
plt.xlabel(u'时间')  # x-axis title
plt.ylabel(u'温度')  # y-axis title
plt.legend(loc='best')

plt.savefig('line_plot.png', dpi=400, bbox_inches='tight')  # save the chart to disk
plt.show()  # open the chart window

代码中也有注释，所以不再多说。

Blog of Dada

Took the sourest lemon that life has to offer and turned it into something resembling lemonade.

Python爬取上海天气且可视化显示

一、安装Anaconda3环境和Pycharm

二、Pycharm使用Anaconda3环境

三、使用爬虫爬取天气信息

四、数据可视化显示