[figure_5.png: the generated word cloud]

Let's look at the result first. The script scrapes the Douban reviews of Beauty and the Beast with a small crawler, segments them with jieba, and finally generates the image from the word frequencies; that last step can be done directly with the wordcloud module.
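The key detail in the segmentation step is that jieba.cut returns a generator of word segments, while WordCloud's default tokenizer splits text on whitespace and punctuation, so the segments have to be rejoined with spaces before being passed to generate(). A minimal sketch of just that step (the sample sentence is made up for illustration):

import jieba

# jieba.cut lazily yields the segmented words.
words = jieba.cut('美女与野兽是一部很好看的电影')
# Join with spaces so WordCloud can count each word separately.
print(' '.join(words))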

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
import requests
from lxml import etree

def spider(name, txt, page):
    # name: the movie's numeric ID (the string of digits in its Douban URL)
    # txt: name of the text file the reviews are saved to
    # page: number of review pages to fetch
    start = 0
    # Paste your own cookie string here; anonymous requests are easily
    # rejected by Douban.
    headers = {'Cookie': ''}
    with open(txt, 'w', encoding='utf-8') as f:
        for i in range(page):
            url = 'https://movie.douban.com/subject/%s/reviews?start=%d' % (name, start)
            print(url)
            html = requests.get(url, headers=headers)
            pro = etree.HTML(html.text)
            p = pro.xpath('//*[@class="review-short"]/div/text()')
            for c in p:
                f.write(c)
            start += 20  # Douban lists 20 reviews per page

def pic(txt):
    with open(txt, encoding='utf-8') as f:
        text = f.read()
    # Segment the Chinese text with jieba and rejoin with spaces so that
    # WordCloud can count individual words.
    t = ' '.join(jieba.cut(text))
    # font_path must point to a font file that contains Chinese glyphs;
    # this path only exists on macOS, and the default font can only
    # render English.
    wordcloud = WordCloud(max_words=200, background_color='black',
                          font_path='/Library/Fonts/Hanzipen.ttc',
                          width=1400, height=900).generate(t)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

if __name__ == '__main__':
    txt = 'demo.txt'
    spider('25900945', txt, 90)
    pic(txt)
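If you want the image written to disk rather than only shown in a matplotlib window, WordCloud can save it directly; a one-line addition inside pic() after generate() would do it (the filename here is just an example):

wordcloud.to_file('wordcloud.png')  # save the rendered cloud as a PNG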