知乎笔试题
过滤html标签
假设在一个文本编辑器中允许使用富文本,但只允许使用以下html标签和属性,
<a href="" title=""> <abbr title=""> <acronym title=""> <b>
<blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q
cite=""> <strike> <strong> <pre>
编写一个过滤器,对任意输入的文本进行过滤,输出符合要求的富文本
代码如下:
#!/usr/bin/env python
#coding: utf8
from BeautifulSoup import BeautifulSoup
# Sample document for the filter demo.  It mixes whitelisted tags (<b>, <a>)
# with tags that are not whitelisted (<html>, <head>, <title>, <body>, <p>,
# <script>) and an attribute that is not whitelisted (onclick).
html = (
    '<html><head><title>Page title</title></head><body>'
    '<p id="firstpara" align="center">This is paragraph <b>one</b>.'
    '<script type="text/javascript">"only test";</script>'
    '<a href="" title="" onclick="">test</a>'
    '<p id="secondpara" align="blah">This is paragraph <b>two</b>.</html>'
)
# <a href="" title=""> <abbr title=""> <acronym title=""> <b>
# <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q
# cite=""> <strike> <strong> <pre>
# Allowed tags mapped to their allowed attributes.  Each inner dict maps an
# attribute name to '*'; an empty dict means the tag may carry no attributes.
WHITE_LIST = {
    'a': {'href': '*', 'title': '*'},
    'abbr': {'title': '*'},
    'acronym': {'title': '*'},
    'b': {},
    'blockquote': {'cite': '*'},
    'cite': {},
    'code': {},
    'del': {'datetime': '*'},
    'em': {},
    'i': {},
    'pre': {},
    'q': {'cite': '*'},
    'strike': {},
    'strong': {},
}
def parseHtml(html):
    """Strip *html* down to the tags and attributes allowed by WHITE_LIST.

    Tags that are not whitelisted are marked hidden, so the tag itself is
    left out of the rendered output while its children are kept.
    ``<script>`` and ``<style>`` elements are removed together with their
    contents, because their bodies are code rather than document text and
    must not leak into the output.  On whitelisted tags, every attribute
    that is not named in the tag's rule set is deleted.

    NOTE(review): attribute *values* are not inspected (every rule is the
    ``'*'`` wildcard), so e.g. ``href="javascript:..."`` passes through.
    Add URL-scheme checking before relying on this for untrusted input.

    :param html: markup to filter.
    :return: the filtered markup as produced by ``soup.renderContents()``.
    """
    dropped_with_content = ('script', 'style')

    soup = BeautifulSoup(html)
    for tag in soup.findAll(True):
        if tag.name in dropped_with_content:
            tag.extract()
            continue
        if tag.name not in WHITE_LIST:
            tag.hidden = True
            continue

        attr_rules = WHITE_LIST[tag.name]
        # Snapshot the attribute names first: ``del tag[name]`` edits
        # ``tag.attrs``, and removing entries from a list while looping over
        # it skips the entry that follows each removal.
        for attr_name in set(name for name, _ in tag.attrs):
            if attr_name not in attr_rules:
                del tag[attr_name]
    return soup.renderContents()


if __name__ == '__main__':
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(parseHtml(html))
假设给你一个月的日志,格式如下:
[I 130403 17:26:40] 1 200 GET /question/123 (8.8.9.9) 200.39ms
[I 130403 17:26:90] 1 200 GET /topic/456 (8.8.9.9) 300.85ms
[I 130403 17:26:90] 1 200 POST /answer/789 (8.8.9.9) 300.85ms
...
方括号中依次是:级别,日期,时间,后面依次是用户id,返回码,访问方式,访问路径,用户ip,响应时间
日志文件名格式为:年-月-日-小时.log,如:2013-01-01-18.log,共30*24个文件。
写个程序,算出一个用户列表同时符合以下两个要求:
- 这些用户每天都会访问(GET)/topic/*这个路径至少两次(*代表数字)
- 这些用户每天访问(GET)的/topic/*路径中,至少会包含两个不同的路径(后面的数字不一样)
再算出一个路径列表满足:
每天都被以上用户中至少两个用户访问
实现思路:
- 先把一个月的30*24个文件名分天存储在一个filepaths_one_month列表 [[filepaths_one_day], …]
- 然后算出每一天符合要求的用户所访问的路径列表 user_dict_day {user_id: [path, …]},以及符合要求的路径对应的访问用户列表 path_dict_day {path: [user_id, …]}。
- 接着对每天的 user_dict_day 的 keys 求交集,得到每天都访问的用户;再以这些用户为 key,找出各天访问路径列表的交集不为空的 user_id,即为要求1的答案 user_list。
- 接下来对 path_dict_day 进行相同的操作,得出每天都被至少两个用户访问的路径列表;最后再把访问这些路径的用户列表与要求1求出的用户列表求交集,交集不为空的路径即构成 path_list。
需要说明一点,日志文件的年、月、日都在test()函数设置,然后在当前文件夹寻找日志文件,日志的小时范围为00点-23点(而不是01点-24点)。为了一些可读性,包含了一些较冗余的代码,不过代码不多,相信影响不大。
代码如下:
#!/usr/bin/env python
#coding: utf8
import re
# Names of the whitespace-separated fields that follow the
# "[level date time]" prefix of a log line, in the order they appear.
log_keys = [
    'user_id',
    'return_code',
    'access_way',
    'access_path',
    'ip_address',
    'time'
]


class Find(object):
    """Find regular /topic/ readers, and the paths they share, in hourly logs.

    A user qualifies when, on every day of the period, they GET at least two
    different ``/topic/<number>`` paths (two different paths already imply
    at least two requests that day).  A path qualifies when, on every day,
    at least two of the qualifying users GET it; they need not be the same
    users each day.

    Log files are opened by bare file name (``YYYY-MM-DD-HH.log``), i.e.
    relative to the current working directory, one file per hour.
    """

    # Matches a GET request for a numeric topic path anywhere in a log line.
    _TOPIC_GET = re.compile('GET /topic/[0-9]+')

    def __init__(self, days, month, year):
        """Prepare the file names for days ``1..days`` of *month*/*year*.

        No file is opened here; reading happens in ``find_one_month``.
        """
        self.filepaths_one_month = self.file_paths(days, month, year)
        self.user_dict_month = {}  # {user_id: set(path, ...)}
        self.path_dict_month = {}  # {path: set(user_id, ...)}

    def find_one_month(self):
        """Compute the qualifying users and paths for the whole period.

        As a side effect, rebuilds ``user_dict_month`` (each qualifying user
        mapped to every topic path they fetched during the period) and
        ``path_dict_month`` (each qualifying path mapped to the qualifying
        users who fetched it).

        :return: ``(user_list, path_list)``, two sorted lists; both are
            empty when the period contains no days.
        :raises IOError: if a log file cannot be opened (``OSError`` on
            Python 3).
        """
        daily = [self.find_one_day(filepaths_one_day)
                 for filepaths_one_day in self.filepaths_one_month]

        # Start from scratch on every run so nothing stale survives.
        self.user_dict_month = {}
        self.path_dict_month = {}
        if not daily:
            return [], []

        # A user has to qualify on every single day: intersect the daily
        # sets instead of accumulating them.
        users = set(daily[0][0])
        for user_dict_day, _ in daily[1:]:
            users &= set(user_dict_day)

        # A path has to be fetched by at least two qualifying users on
        # every single day.
        paths = None
        for _, path_dict_day in daily:
            shared_today = set(
                path for path, visitors in path_dict_day.items()
                if len(users.intersection(visitors)) >= 2)
            paths = shared_today if paths is None else paths & shared_today

        # Every surviving user/path is present in every day's dict, so the
        # lookups below cannot miss.
        for user_dict_day, path_dict_day in daily:
            for user_id in users:
                self.user_dict_month.setdefault(user_id, set()).update(
                    user_dict_day[user_id])
            for path in paths:
                self.path_dict_month.setdefault(path, set()).update(
                    users.intersection(path_dict_day[path]))

        return sorted(users), sorted(paths)

    def file_paths(self, days, month, year):
        """Return one list of 24 hourly file names per day, hours 00-23.

        :return: ``[[name_for_hour_00, ..., name_for_hour_23], ...]`` with
            one inner list for each day ``1..days``.
        """
        filepaths = []
        for day in range(1, days + 1):
            filepaths.append([
                "%04d-%02d-%02d-%02d.log" % (year, month, day, hour)
                for hour in range(24)
            ])
        return filepaths

    def find_one_day(self, filepaths):
        """Scan one day's log files for GET requests to topic paths.

        :param filepaths: names of the log files that make up the day.
        :return: ``(user_dict_day, path_dict_day)``.  ``user_dict_day`` maps
            each user who fetched at least two different topic paths that
            day to the sorted list of those paths.  ``path_dict_day`` maps
            every fetched topic path to the sorted list of users who
            fetched it.
        :raises IOError: if a file cannot be opened (``OSError`` on
            Python 3).
        """
        paths_by_user = {}  # {user_id: set(path, ...)}
        users_by_path = {}  # {path: set(user_id, ...)}
        for filepath in filepaths:
            # ``with`` closes each hourly file as soon as it has been read.
            with open(filepath) as log_file:
                for line in log_file:
                    if not self.is_get_and_topic(line):
                        continue
                    log_dict = self.parser_line(line)
                    user_id = log_dict['user_id']
                    path = log_dict['access_path']
                    paths_by_user.setdefault(user_id, set()).add(path)
                    users_by_path.setdefault(path, set()).add(user_id)

        user_dict_day = dict(
            (user_id, sorted(paths))
            for user_id, paths in paths_by_user.items()
            if len(paths) >= 2)
        path_dict_day = dict(
            (path, sorted(user_ids))
            for path, user_ids in users_by_path.items())
        return user_dict_day, path_dict_day

    def parser_line(self, str):
        """Split one log line into a dict keyed by ``log_keys``.

        The text after the ``"] "`` that closes the bracketed prefix is split
        on whitespace, so the trailing newline never ends up in a value.
        Surplus fields are ignored and missing ones are simply absent.

        The parameter keeps its original name ``str`` for compatibility with
        keyword callers, although it shadows the builtin inside this method.
        """
        fields = str[str.find(']') + 2:].split()
        return dict(zip(log_keys, fields))

    def is_get_and_topic(self, str):
        """Return True if the line contains a GET of ``/topic/<digits>``."""
        return self._TOPIC_GET.search(str) is not None
def test(days=30, month=1, year=2013):
    """Run the analysis on the logs in the current directory and print it.

    :param days: number of days in the month to scan (days ``1..days``).
    :param month: month number used in the log file names.
    :param year: year used in the log file names.
    """
    # Named ``finder`` so the local does not shadow this function's name.
    finder = Find(days, month, year)
    user_list, path_list = finder.find_one_month()
    # Single-argument form prints identically on Python 2 and Python 3.
    print('user_list: %s' % (user_list,))
    print('path_list: %s' % (path_list,))


if __name__ == '__main__':
    test()