知乎笔试题

过滤html标签

假设在一个文本编辑器中允许使用富文本，但只允许使用以下html标签和属性，

<a href="" title=""> <abbr title=""> <acronym title=""> <b>
<blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q
cite=""> <strike> <strong>  <pre>

编写一个过滤器，对任意输入的文本进行过滤，输出符合要求的富文本

代码如下：

#!/usr/bin/env python
#coding: utf8

from BeautifulSoup import BeautifulSoup

# Sample document used by the demo run at the bottom of the script. It mixes
# whitelisted tags (<b>, <a>) with tags that are not whitelisted (<html>, <p>,
# <script>, ...) and carries one attribute that is not whitelisted (onclick).
html = (
    '<html><head><title>Page title</title></head><body>'
    '<p id="firstpara" align="center">This is paragraph <b>one</b>.'
    '<script type="text/javascript">"only test";</script>'
    '<a href="" title="" onclick="">test</a>'
    '<p id="secondpara" align="blah">This is paragraph <b>two</b>.</html>'
)

# Allowed markup, taken from the task statement:
#   <a href="" title=""> <abbr title=""> <acronym title=""> <b>
#   <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i>
#   <q cite=""> <strike> <strong> <pre>
#
# Maps each allowed tag name to the attributes allowed on that tag. Only the
# attribute names are consulted by parseHtml; every value is the placeholder
# '*'.
WHITE_LIST = {
    'a': {'href': '*', 'title': '*'},
    'abbr': {'title': '*'},
    'acronym': {'title': '*'},
    'b': {},
    'blockquote': {'cite': '*'},
    'cite': {},
    'code': {},
    'i': {},
    'del': {'datetime': '*'},
    'em': {},
    'q': {'cite': '*'},
    'strike': {},
    'strong': {},
    'pre': {},
}


def parseHtml(html):
    """Filter *html* down to the tags and attributes listed in WHITE_LIST.

    Every tag whose name is not a key of WHITE_LIST gets ``hidden = True``;
    every other tag keeps only the attributes named in its WHITE_LIST entry.

    NOTE(review): hiding a tag suppresses the tag's own markup only; its
    children and text (for example the body of a <script>) are still part of
    the output. Attribute *values* are not inspected either, so something
    like href="javascript:..." passes through. Confirm both are acceptable.

    :param html: markup to filter.
    :return: the filtered markup, as returned by ``soup.renderContents()``.
    """
    soup = BeautifulSoup(html)
    for tag in soup.findAll(True):
        if tag.name not in WHITE_LIST:
            tag.hidden = True
            continue

        attr_rules = WHITE_LIST[tag.name]
        # Collect the offending names before deleting anything:
        # ``del tag[name]`` removes entries from ``tag.attrs``, and deleting
        # from a list while looping over it makes the loop skip the entry
        # that follows each deletion, letting a disallowed attribute survive.
        rejected = [name for name, _ in tag.attrs if name not in attr_rules]
        for name in rejected:
            del tag[name]

    return soup.renderContents()


# Script entry point: filter the sample document and print the result.
if __name__ == '__main__':
    filtered = parseHtml(html)
    print(filtered)

假设给你一个月的日志，格式如下：

[I 130403 17:26:40] 1 200 GET /question/123 (8.8.9.9) 200.39ms
[I 130403 17:26:90] 1 200 GET /topic/456 (8.8.9.9) 300.85ms
[I 130403 17:26:90] 1 200 POST /answer/789 (8.8.9.9) 300.85ms
...

方括号中依次是：级别，日期，时间，后面依次是用户id，返回码，访问方式，访问路径，用户ip，响应时间

日志文件名格式为：年-月-日-小时.log，如：2013-01-01-18.log，共30*24个文件。

写个程序，算出一个用户列表同时符合以下两个要求：

这些用户每天都会访问（GET）/topic/*这个路径至少两次（*代表数字）
这些用户每天访问（GET）的/topic/*路径中，至少会包含两个不同的路径（后面的数字不一样）

再算出一个路径列表满足:
每天都被以上用户中至少两个用户访问

实现思路：

先把一个月的30*24个文件名分天存储在一个filepaths_one_month列表 [[filepaths_one_day], …]
然后算出每一天符合要求的用户访问的路径列表 user_dict_day {user_id: [path, …]} 和符合要求的被访问路径的用户列表
path_dict_day {path: [user_id, …]}
然后把每天的user_dict_day的keys 求交集，求出每天都访问的用户，再用其为key, 找到这些用户访问的路径列表交集不为空的user_id, 即为要求1的答案user_list，接下来则对path_dict_day进行相同的操作，得出每天都被至少两个用户访问的路径列表，最后再用访问这些路径的用户列表与要求1求出的用户列表求交集，得出使交集不为空的path_list.
需要说明一点，日志文件的年、月、日都在test()函数设置，然后在当前文件夹寻找日志文件，日志的小时范围为00点-23点(而不是01点-24点)。为了一些可读性，包含了一些较冗余的代码，不过代码不多，相信影响不大。

代码如下：

#!/usr/bin/env python
#coding: utf8

import re

# Names given, in order, to the space-separated fields that follow the
# "[level date time]" prefix of a log line:
#   user_id      id of the requesting user
#   return_code  status code returned for the request
#   access_way   request method (GET, POST, ...)
#   access_path  requested path, e.g. /topic/456
#   ip_address   client ip as logged, e.g. (8.8.9.9)
#   time         response time as logged, e.g. 200.39ms
log_keys = ['user_id', 'return_code', 'access_way',
            'access_path', 'ip_address', 'time']


class Find(object):
    """Analyse one month of hourly access logs for GET /topic/<digits> hits.

    The month is processed one day (24 hourly log files) at a time:

    * a user qualifies on a day when he or she requested at least two
      different ``/topic/<digits>`` paths with GET on that day (which also
      means at least two such requests were made);
    * ``find_one_month`` keeps the users that qualify on every day, and the
      paths that, on every day, were requested by at least two of those
      users.
    """

    # Marks a log line as a GET request for a /topic/<digits> path.
    _TOPIC_GET = re.compile('GET /topic/[0-9]+')

    def __init__(self, days, month, year):
        """Prepare the analysis of days 1..*days* of *month* in *year*.

        :param days: number of days to analyse, starting at day 1.
        :param month: month number used in the log file names.
        :param year: year used in the log file names.
        """
        self.filepaths_one_month = self.file_paths(days, month, year)
        # Filled in by find_one_month():
        self.user_dict_month = {}       # {user_id: set(path, ...)}
        self.path_dict_month = {}       # {path: set(user_id, ...)}

    def find_one_month(self):
        """Compute the users and the paths that meet the rules on every day.

        Side effects: ``user_dict_month`` is rebuilt to map every returned
        user to all topic paths that user requested during the month, and
        ``path_dict_month`` to map every returned path to the returned users
        that requested it during the month.

        :return: ``(user_list, path_list)``, both sorted lists of strings.
            Both are empty when there are no days to analyse.
        """
        daily_users = []    # one {user_id: [path, ...]} per day
        daily_paths = []    # one {path: [user_id, ...]} per day
        for filepaths_one_day in self.filepaths_one_month:
            user_dict_day, path_dict_day = self.find_one_day(filepaths_one_day)
            daily_users.append(user_dict_day)
            daily_paths.append(path_dict_day)

        # Start from a clean slate so that a second call cannot inherit
        # entries from the previous one.
        self.user_dict_month = {}
        self.path_dict_month = {}
        if not daily_users:
            return [], []

        # A user has to qualify on every single day; one missed day drops
        # the user for good.
        users = set(daily_users[0])
        for user_dict_day in daily_users[1:]:
            users.intersection_update(user_dict_day)
        for user_id in users:
            visited = set()
            for user_dict_day in daily_users:
                visited.update(user_dict_day[user_id])
            self.user_dict_month[user_id] = visited

        # A path has to be requested by at least two of those users on every
        # day; the users may be different ones from day to day.
        candidates = set(daily_paths[0])
        for path_dict_day in daily_paths[1:]:
            candidates.intersection_update(path_dict_day)
        for path in candidates:
            visitors = set()
            for path_dict_day in daily_paths:
                todays = users.intersection(path_dict_day[path])
                if len(todays) < 2:
                    break
                visitors.update(todays)
            else:
                # Reached only when no day broke out of the loop above.
                self.path_dict_month[path] = visitors

        return sorted(self.user_dict_month), sorted(self.path_dict_month)

    def file_paths(self, days, month, year):
        """Build the log file names of the month, grouped by day.

        Names look like ``2013-01-01-18.log`` (year-month-day-hour, numbers
        padded with zeros to at least two digits); hours run from 00 to 23.

        :return: a list with one entry per day (day 1 .. *days*), each entry
            being the list of that day's 24 file names in hour order.
        """
        return [
            ["%02d-%02d-%02d-%02d.log" % (year, month, day, hour)
             for hour in range(24)]
            for day in range(1, days + 1)
        ]

    def find_one_day(self, filepaths):
        """Evaluate the log files of a single day.

        Only lines accepted by ``is_get_and_topic`` are considered.

        :param filepaths: paths of the log files that make up the day.
        :return: ``(user_dict_day, path_dict_day)`` where

            * ``user_dict_day`` maps each user that requested at least two
              different topic paths that day to the sorted list of those
              paths;
            * ``path_dict_day`` maps each topic path requested by at least
              two different users that day to the sorted list of those
              users.
        :raises IOError: if one of the files cannot be opened.
        """
        user_paths = {}     # {user_id: set(path, ...)}
        path_users = {}     # {path: set(user_id, ...)}

        for filepath in filepaths:
            # ``with`` closes each hourly file as soon as it has been read,
            # and iterating the file avoids loading it into memory at once.
            with open(filepath) as log_file:
                for line in log_file:
                    if not self.is_get_and_topic(line):
                        continue
                    log_dict = self.parser_line(line)
                    user_id = log_dict.get('user_id')
                    path = log_dict.get('access_path')
                    if user_id is None or path is None:
                        # Truncated line: not enough fields to attribute it.
                        continue
                    user_paths.setdefault(user_id, set()).add(path)
                    path_users.setdefault(path, set()).add(user_id)

        user_dict_day = {}
        for user_id, paths in user_paths.items():
            # Two different paths imply at least two requests, so this one
            # check covers both per-user rules.
            if len(paths) >= 2:
                user_dict_day[user_id] = sorted(paths)

        path_dict_day = {}
        for path, users in path_users.items():
            if len(users) >= 2:
                path_dict_day[path] = sorted(users)

        return user_dict_day, path_dict_day

    def parser_line(self, str):
        """Split one log line into a ``{field name: text}`` dict.

        Everything up to and including the first ``]`` (the
        ``[level date time]`` prefix) is dropped; the remaining
        whitespace-separated fields are paired, in order, with the names in
        the module-level ``log_keys`` list.

        :param str: a raw log line; a trailing newline is allowed.
        :return: dict of field texts. Fields beyond the known names are
            ignored and missing fields are simply absent from the dict.
        """
        prefix, bracket, rest = str.partition(']')
        if not bracket:
            # No "[...]" prefix at all: treat the whole line as the fields.
            rest = prefix
        # split() without arguments also discards the trailing newline.
        return dict(zip(log_keys, rest.split()))

    def is_get_and_topic(self, str):
        """Return True when *str* contains ``GET /topic/<digits>``."""
        return self._TOPIC_GET.search(str) is not None


def test():
    """Analyse the logs of 2013-01-01 .. 2013-01-30 and print both results.

    The log files are looked up by relative name, i.e. in the current
    working directory.
    """
    days = 30        # number of days of logs to analyse
    year = 2013
    month = 1
    # Named ``finder`` rather than ``test`` so the local does not shadow this
    # function.
    finder = Find(days, month, year)
    user_list, path_list = finder.find_one_month()

    # A single pre-formatted argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('user_list: %s' % (user_list,))
    print('path_list: %s' % (path_list,))


if __name__ == '__main__':
    test()