Playing with Word Clouds

1 jieba Chinese Word Segmentation

1.1 Word Segmentation

import jieba

# srcWd is not defined in the original; inferred from the first set of
# outputs below
srcWd = u'做最好的Python中文分词组件'

seg_list = jieba.cut(srcWd, cut_all=True)
print(u'Full mode:' + u'/ '.join(seg_list))

seg_list = jieba.cut(srcWd, cut_all=False)
print(u'Default mode:' + u'/ '.join(seg_list))

seg_list = jieba.cut_for_search(srcWd)
print(u', '.join(seg_list))

# The second set of outputs comes from running the same code with
# srcWd = u'我目前在学习Golang和Python。'
Full mode:做/ 最好/ 的/ Python/ 中文/ 分词/ 词组/ 组件
Default mode:做/ 最好/ 的/ Python/ 中文/ 分词/ 组件
做, 最好, 的, Python, 中文, 分词, 组件
Full mode:我/ 目前/ 在/ 学习/ Golang/ 和/ Python/
Default mode:我/ 目前/ 在/ 学习/ Golang/ 和/ Python/ 。
我, 目前, 在, 学习, Golang, 和, Python, 。

1.2 Custom Dictionary

I had collected plenty of job postings, but keyword extraction on them was far from ideal. Looking through jieba's bundled dictionary, I found it has essentially no support for English technical terms, which was a real pain. Fortunately jieba supports custom dictionaries, so I set out to build one for English terms. Adding entries one line at a time would be torture for a lazy person like me, but the curating has already been done: the job sites themselves list these terms, neatly categorized, and I only need to fetch them.
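jieba expects a user dictionary to be a UTF-8 text file with one entry per line: the word itself, optionally followed by a frequency and a POS tag, separated by spaces. The entries below are hypothetical samples of what the generated file might look like:

Hadoop 10 eng
Golang 10 eng
数据仓库 5 n
odps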

import urllib2

from bs4 import BeautifulSoup


def get_html(url):
    request = urllib2.Request(url)
    request.add_header(
        'User-Agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
    )
    html = urllib2.urlopen(request)
    return html.read()


def get_soup(url):
    soup = BeautifulSoup(get_html(url), 'lxml')
    return soup
def fetch_lagou():
    words = []
    url = 'https://www.lagou.com/'
    soup = get_soup(url)
    category_list = soup.find_all('div', attrs={'class': 'menu_sub dn'})
    for category in category_list:
        dls = category.find_all('dl')
        for dl in dls:
            names = dl.dd.find_all('a')
            for name in names:
                words.append(name.text)
    return words


def fetch_zhipin():
    words = []
    url = 'http://www.zhipin.com/'
    soup = get_soup(url)
    job_menu = soup.find('div', attrs={'class': 'job-menu'})
    dls = job_menu.find_all('dl')
    for dl in dls:
        divs = dl.find_all('div', attrs={'class': 'text'})
        for div in divs:
            names = div.find_all('a')
            for name in names:
                words.append(name.text)
    return words

It turned out my thinking was too narrow: the terms a job site curates are the high-frequency ones, but their breadth is certainly lacking. What to do? Then stackoverflow flashed into my mind; no other site can match its breadth of technology topics.

def fetch_stackoverflow():
    words = []
    for pageNo in range(1, 20):
        url = 'https://stackoverflow.com/tags?page=%d&tab=popular' % (pageNo)
        soup = get_soup(url)
        tags_list = soup.find('div', attrs={'id': 'tags_list'})
        trs = tags_list.table.find_all('tr')
        for tr in trs:
            tds = tr.find_all('td')
            for td in tds:
                words.append(td.a.text)
    return words
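The step that writes the fetched words into the dictionary file is left implicit; here is a minimal sketch, assuming the three fetchers above and the ../resource/userdict.txt path loaded in section 1.3 (build_userdict is a hypothetical helper, not from the original code):

def build_userdict(path='../resource/userdict.txt'):
    # Merge the three sources and drop duplicates
    words = set(fetch_lagou() + fetch_zhipin() + fetch_stackoverflow())
    with open(path, 'w') as f:
        for word in sorted(words):
            # A bare word per line is enough; frequency and POS tag are optional
            f.write(word.encode('utf-8') + '\n')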

1.3 Adjusting the Dictionary

import jieba
import jieba.analyse

# srcWd holds a job-requirement string; its definition is omitted in the original
print(', '.join(
    jieba.analyse.extract_tags(srcWd, allowPOS=('n', 'nz', 'eng', 'vn'))))
jieba.load_userdict('../resource/userdict.txt')
print(', '.join(
    jieba.analyse.extract_tags(srcWd, allowPOS=('n', 'nz', 'eng', 'vn'))))
理解能力, 经验, 数据挖掘, 敏锐度, 平台, odps, hadoop, 数据仓库, hive, 建模, 业务, 海量, 能力, 数据, 协作, 优先, 团队, 模型, 计算机, 学科
理解能力, 经验, 数据挖掘, 敏锐度, 平台, odps, hadoop, 数据仓库, hive, 建模, 业务, 海量, 能力, 数据, 协作, 优先, 团队, 模型, 计算机, 学科
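Both runs print the same tags here, so loading the user dictionary changed nothing for this particular text. Beyond load_userdict, jieba can also adjust the dictionary at runtime; a minimal sketch using add_word and suggest_freq, with the example sentence from jieba's own documentation:

import jieba

# 中将 is (wrongly) kept as one word by the default dictionary
print('/'.join(jieba.cut(u'如果放到post中将出错。', HMM=False)))
# Tell jieba that 中 and 将 should be segmented apart in this context
jieba.suggest_freq((u'中', u'将'), True)
print('/'.join(jieba.cut(u'如果放到post中将出错。', HMM=False)))
# New words can also be injected directly without a dictionary file
jieba.add_word(u'石墨烯')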

1.4 Keyword Extraction

import jieba.analyse

# Extract keywords with no POS restriction, then restricted to nouns,
# organization names, other proper nouns, English tokens, and verbal nouns
print(', '.join(jieba.analyse.extract_tags(srcWd, allowPOS=())))
print(', '.join(
    jieba.analyse.extract_tags(
        srcWd, allowPOS=('n', 'nt', 'nz', 'eng', 'vn'))))

# withWeight=True returns (word, weight) pairs instead of bare words
wordlst = jieba.analyse.extract_tags(
    srcWd, withWeight=True, allowPOS=('n', 'nt', 'nz', 'eng', 'vn'))
for word in wordlst:
    print(word[0] + ": " + str(word[1]))
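extract_tags ranks keywords by TF-IDF weight; jieba also ships a TextRank-based extractor with the same calling convention: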
import jieba.analyse

print(', '.join(jieba.analyse.textrank(srcWd)))
print(', '.join(
    jieba.analyse.textrank(srcWd, allowPOS=('n', 'nz', 'eng', 'vn'))))

1.5 Part-of-Speech Tagging

import jieba
import jieba.posseg as pseg

jieba.load_userdict('../resource/userdict.txt')
words = pseg.cut(srcWd)  # srcWd: a job-requirement string, as above
for word, flag in words:
    print('%s %s' % (word, flag))
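The flags printed here are the same POS codes that allowPOS filtered on in section 1.4. A small sketch of putting them to work (keep_flags is a hypothetical helper, not from the original code):

import jieba.posseg as pseg


def keep_flags(text, flags=('n', 'nz', 'eng', 'vn')):
    # Return only the words whose POS flag is in `flags`
    return [word for word, flag in pseg.cut(text) if flag in flags]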

2 Word Clouds with wordcloud

2.1 A Simple Word Cloud: the US Constitution

"""
Minimal Example
===============
Generating a square wordcloud from the US constitution using default arguments.
"""


from wordcloud import WordCloud

# Read the whole text.
text = open('../resource/constitution.txt').read()

# Generate a word cloud image
wordcloud = WordCloud(width=640, height=480).generate(text)
wordcloud.to_file('../images/constitution-n.png')

wordcloud = WordCloud(width=640, height=480, max_font_size=80).generate(text)
wordcloud.to_file('../images/constitution-s.png')
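The only difference between the two runs is max_font_size=80 in the second call, which caps the largest words and flattens the size distribution, as the two images below show.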

<img src="/images/constitution-n.png" />

<img src="/images/constitution-s.png" />

2.2 Colored by Group

from wordcloud import (WordCloud, get_single_color_func)


class SimpleGroupedColorFunc(object):
    """Create a color function object which assigns EXACT colors
    to certain words based on the color to words mapping

    Parameters
    ----------
    color_to_words : dict(str -> list(str))
      A dictionary that maps a color to the list of words.

    default_color : str
      Color that will be assigned to a word that's not a member
      of any value from color_to_words.
    """

    def __init__(self, color_to_words, default_color):
        self.word_to_color = {
            word: color
            for (color, words) in color_to_words.items() for word in words
        }

        self.default_color = default_color

    def __call__(self, word, **kwargs):
        return self.word_to_color.get(word, self.default_color)


class GroupedColorFunc(object):
    """Create a color function object which assigns DIFFERENT SHADES of
    specified colors to certain words based on the color to words mapping.

    Uses wordcloud.get_single_color_func

    Parameters
    ----------
    color_to_words : dict(str -> list(str))
      A dictionary that maps a color to the list of words.

    default_color : str
      Color that will be assigned to a word that's not a member
      of any value from color_to_words.
    """

    def __init__(self, color_to_words, default_color):
        self.color_func_to_words = [
            (get_single_color_func(color), set(words))
            for (color, words) in color_to_words.items()
        ]

        self.default_color_func = get_single_color_func(default_color)

    def get_color_func(self, word):
        """Returns a single_color_func associated with the word"""
        try:
            color_func = next(
                color_func for (color_func, words) in self.color_func_to_words
                if word in words)
        except StopIteration:
            color_func = self.default_color_func

        return color_func

    def __call__(self, word, **kwargs):
        return self.get_color_func(word)(word, **kwargs)


text = """The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!"""


# Since the text is small, collocations are turned off and the text is lower-cased
wc = WordCloud(collocations=False).generate(text.lower())

color_to_words = {
    # words below will be colored with a green single color function
    '#00ff00': [
        'beautiful', 'explicit', 'simple', 'sparse', 'readability', 'rules',
        'practicality', 'explicitly', 'one', 'now', 'easy', 'obvious', 'better'
    ],
    # words below will be colored with a red single color function
    'red': [
        'ugly', 'implicit', 'complex', 'complicated', 'nested', 'dense',
        'special', 'errors', 'silently', 'ambiguity', 'guess', 'hard'
    ]
}

# Words that are not in any of the color_to_words values
# will be colored with a grey single color function
default_color = 'grey'

# Create a color function with single tone
# grouped_color_func = SimpleGroupedColorFunc(color_to_words, default_color)

# Create a color function with multiple tones
grouped_color_func = GroupedColorFunc(color_to_words, default_color)

# Apply our color function
wc.recolor(color_func=grouped_color_func)
wc.to_file('../images/grouped-color.png')
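Swapping in the commented-out SimpleGroupedColorFunc gives each group a single flat tone instead of varied shades; both classes expose the same __call__ interface, so the recolor call works with either.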

<img src="/images/grouped-color.png" />

2.3 A Word Cloud of Journey to the West

import jieba.analyse
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
from PIL import Image
import random


def grey_color_func(word,
                    font_size,
                    position,
                    orientation,
                    random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)


font_path = '../resource/tyzkaishu.ttf'
width = 640
height = 480

text = open('../resource/xiyouji.txt').read()
words = jieba.analyse.extract_tags(text, topK=200, withWeight=True)

word_freqs = {}
for word in words:
    word_freqs[word[0]] = word[1]

wordcloud = WordCloud(
    font_path=font_path, width=width,
    height=height).generate_from_frequencies(word_freqs)
wordcloud.to_file('../images/xiyouji.png')

mask = np.array(Image.open('../resource/stormtrooper_mask.png'))
wordcloud = WordCloud(
    font_path=font_path, width=width, height=height,
    mask=mask).generate_from_frequencies(word_freqs)
wordcloud.to_file('../images/xiyouji-mask.png')

# recolor wordcloud
wordcloud.recolor(color_func=grey_color_func, random_state=3)
wordcloud.to_file('../images/xiyouji-custom.png')

alice_coloring = np.array(Image.open('../resource/alice_color.png'))
wordcloud = WordCloud(
    font_path=font_path,
    width=width,
    height=height,
    background_color="white",
    mask=alice_coloring,
    max_font_size=80,
    random_state=42).generate_from_frequencies(word_freqs)

# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)

# recolor wordcloud
wordcloud.recolor(color_func=image_colors)
wordcloud.to_file('../images/xiyouji-color.png')
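ImageColorGenerator picks each word's color from the pixels of alice_color.png beneath the word's position, so the recolored cloud inherits the source image's palette.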

<img src="/images/xiyouji.png" />

<img src="/images/xiyouji-mask.png" />

<img src="/images/xiyouji-custom.png" />

<img src="/images/xiyouji-color.png" />

2.4 A Word Cloud of Alibaba Job Postings

The job postings were scraped with a crawler I wrote myself; here we only analyze the data.
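For orientation, a hypothetical sketch of a document in the jobs.alibaba collection, inferred from the two fields the code below reads:

{
    u'firstCategory': u'技术类',  # top-level job category
    u'requirement': u'精通Java<br/>熟悉Hadoop、Hive等大数据平台'  # HTML-ish text, hence the <br/> cleanup
}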

# -*- coding: utf-8 -*-
import jieba
import jieba.analyse
from wordcloud import WordCloud
import numpy as np
from PIL import Image
from pymongo import MongoClient

font_path = '../resource/zhengkaishu.ttf'
width = 640
height = 480

client = MongoClient('mongodb://localhost:27017/')
jobs = client.jobs.alibaba
text = ''
for job in jobs.find({u'firstCategory': u'技术类'}):
    if job.get('requirement'):
        text += job.get('requirement').replace('<br/>', ' ') + '\n'

jieba.load_userdict('../resource/userdict.txt')
words = jieba.analyse.extract_tags(text, topK=2000, withWeight=True)
word_freqs = {}
for word in words:
    # Normalize case so variants like 'JAVA' and 'java' merge into one
    # entry, summing their weights
    _word = word[0].lower().capitalize()
    if _word not in word_freqs:
        word_freqs[_word] = word[1]
    else:
        word_freqs[_word] += word[1]

mask = np.array(Image.open('../resource/stormtrooper_mask.png'))
wordcloud = WordCloud(
    font_path=font_path, width=width, height=height,
    mask=mask).generate_from_frequencies(word_freqs)
wordcloud.to_file('../images/alibaba-mask.png')

<img src="/images/alibaba-mask.png" />

Last Updated 2018-04-25 Wed 22:19.