知乎笔试题

过滤html标签

假设在一个文本编辑器中允许使用富文本,但只允许使用以下html标签和属性,

<a href="" title=""> <abbr title=""> <acronym title=""> <b>
<blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q
cite=""> <strike> <strong>  <pre>

编写一个过滤器,对任意输入的文本进行过滤,输出符合要求的富文本

代码如下:

#!/usr/bin/python env
#coding: utf8

from BeautifulSoup import BeautifulSoup

html = '<html><head><title>Page title</title></head><body>\
<p id="firstpara" align="center">This is paragraph <b>one</b>.\
<script type="text/javascript">"only test";</script>\
<a href="" title="" onclick="">test</a>\
<p id="secondpara" align="blah">This is paragraph <b>two</b>.</html>'

# <a href="" title=""> <abbr title=""> <acronym title=""> <b>
# <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q
# cite=""> <strike> <strong>  <pre>

WHITE_LIST = {'a': {'href': '*', 'title': '*'}, 'abbr': {'title': '*'},
              'acronym': {'title': '*'}, 'b': {}, 'blockquote': {'cite': '*'},
              'cite': {}, 'code': {}, 'i': {}, 'del': {'datetime': '*'},
              'em': {}, 'q': {'cite': '*'}, 'strike': {}, 'strong': {},
              'pre': {}}


def parseHtml(html):
    soup = BeautifulSoup(html)
    for tag in soup.findAll(True):
        if tag.name not in WHITE_LIST:
            tag.hidden = True
        else:
            attr_rules = WHITE_LIST[tag.name]
            for attr_name, attr_value in tag.attrs:
                if attr_name not in attr_rules:
                    del tag[attr_name]

    return soup.renderContents()


if __name__ == '__main__':
    print parseHtml(html)

假设给你一个月的日志,格式如下:

[I 130403 17:26:40] 1 200 GET /question/123 (8.8.9.9) 200.39ms
[I 130403 17:26:90] 1 200 GET /topic/456 (8.8.9.9) 300.85ms
[I 130403 17:26:90] 1 200 POST /answer/789 (8.8.9.9) 300.85ms
...

方括号中依次是:级别,日期,时间,后面依次是用户id,返回码,访问方式,访问路径,用户ip,响应时间

日志文件名格式为:年-月-日-小时.log,如:2013-01-01-18.log,共30*24个文件。

写个程序,算出一个用户列表同时符合以下两个要求:

  1. 这些用户每天都会访问(GET)/topic/**这个路径至少两次(代表数字)
  2. 这些用户每天访问(GET)的/topic/*路径中,至少会包含两个不同的路径(后面的数字不一样)

再算出一个路径列表满足:
每天都被以上用户中至少两个用户访问

实现思路:

  • 先把一个月的30*24个文件名分天存储在一个filepaths_one_month列表 [[filepaths_one_day], …]
  • 然后算出每一天符合要求的用户访问的路径列表 user_dict_day {user_id: [path, …]} 和 符合要求的被访问路径的用户列表
    path_dict_day {path: [user_id, …]}
  • 然后把每天的user_dict_day的keys 求交集,求出每天都访问的用户,再用其为key, 找到这些用户访问的路径列表交集不为空的user_id, 即为要求1的答案user_list,接下来则对path_dict_day进行相同的操作,得出每天都被至少两个用户访问的路径列表,最后再用访问这些路径的用户列表与要求1求出的用户列表求交集,得出使交集不为空的path_list.

  • 需要说明一点,日志文件的年、月、日都在test()函数设置,然后在当前文件夹寻找日志文件,日志的小时范围为00点-23点(而不是01点-24点)。为了一些可读性,包含了一些较冗余的代码,不过代码不多,相信影响不大。

代码如下:

#!/usr/bin/env python
#coding: utf8

import re

log_keys = [
    'user_id',
    'return_code',
    'access_way',
    'access_path',
    'ip_address',
    'time'
]


class Find(object):
    def __init__(self, days, month, year):
        self.filepaths_one_month = self.file_paths(days, month, year)
        # for filepaths_day in self.filepaths_one_month:
        #     for filepath in filepaths_day:
        #         print filepath
        self.user_dict_month = {}       # {user_id: {path, ...}}
        self.path_dict_month = {}       # {path: {user_id, ...}}

    def find_one_month(self):
        user_dict_pre_day, path_dict_pre_day = self.find_one_day(self.filepaths_one_month[0])
        for filepaths_one_day in self.filepaths_one_month[1:]:
            user_dict_day, path_dict_day = self.find_one_day(filepaths_one_day)
            user_set = set(user_dict_day.keys()) & set(user_dict_pre_day.keys())
            for user_id in user_set:
                path_set = set(user_dict_day[user_id]) & set(user_dict_pre_day[user_id])
                if len(path_set) > 0:
                    self.user_dict_month[user_id] = path_set
            user_dict_pre_day = self.user_dict_month

            path_set = set(path_dict_day.keys()) & set(path_dict_pre_day.keys())
            for path in path_set:
                user_set = set(path_dict_day[path]) & set(path_dict_pre_day[path])
                self.path_dict_month[path] = user_set
            path_dict_pre_day = self.path_dict_month

        user_list = self.user_dict_month.keys()
        path_list = []

        for path in self.path_dict_month.keys():
            if len(self.path_dict_month[path] & set(user_list)) > 0:
                path_list.append(path)

        return user_list, path_list

    def file_paths(self, days, month, year):
        filepaths = []
        for day in xrange(1, days+1):
            filepaths_day = []
            for hour in xrange(0, 24):
                temppath = "%2d-%2d-%2d-%2d.log" % (year, month, day, hour)
                filepaths_day.append(temppath.replace(' ', '0'))
            filepaths.append(filepaths_day)
        return filepaths

    def find_one_day(self, filepaths):
        user_dict = {}          # {user_id: {path: times}}
        user_dict_day = {}      # {user_id: [path, ...]}
        path_dict = {}          # {path: {user_id: times}}
        path_dict_day = {}      # {path: [user_id, ...]}

        for filepath in filepaths:
            for line in file(filepath).readlines():
                if not self.is_get_and_topic(line):
                    continue
                log_dict = self.parser_line(line)

                user_id = log_dict['user_id']
                path = log_dict['access_path']
                if user_id not in user_dict:
                    user_dict[user_id] = {}
                if path not in user_dict[user_id]:
                    user_dict[user_id][path] = 1
                else:
                    user_dict[user_id][path] = user_dict[user_id][path] + 1

                if path not in path_dict:
                    path_dict[path] = {}
                if user_id not in path_dict[path]:
                    path_dict[path][user_id] = 1
                else:
                    path_dict[path][user_id] = path_dict[path][user_id] + 1

        for user_id in user_dict.keys():
            for path in user_dict[user_id].keys():
                if user_dict[user_id][path] >= 2 and len(user_dict[user_id].keys()) >= 2:
                    if user_id not in user_dict_day:
                        user_dict_day[user_id] = []
                    user_dict_day[user_id].append(path)
        # print 'user_dict_day:', user_dict_day

        for path in path_dict.keys():
            for user_id in path_dict[path].keys():
                if path_dict[path][user_id] >= 2:
                    if path not in path_dict_day:
                        path_dict_day[path] = []
                    path_dict_day[path].append(user_id)
        # print 'path_dict_day:', path_dict_day

        return user_dict_day, path_dict_day

    def parser_line(self, str):
        log_list = str[(str.find(']')+2):].split(' ')
        log_dict = {}
        for index in range(0, len(log_list)):
            log_dict[log_keys[index]] = log_list[index]
        # for key, value in log_dict.items():
        #     print key, value
        return log_dict

    def is_get_and_topic(self, str):
        if re.search('GET /topic/[0-9]+', str):
            return True
        return False


def test():
    days = 30        # 一个月的天数
    year = 2013
    month = 1
    test = Find(days, month, year)
    user_list, path_list = test.find_one_month()

    print 'user_list:', user_list
    print 'path_list:', path_list

if __name__ == '__main__':
    test()

功能支持

  • 简单四则运算
  • 优先级支持
  • 小括号支持

简单演示

$ ./caculator
Please input express:
1+2*1-12+2-(1+2*3)
[1 + 2 * 1 - 12 + 2 - ( 1 + 2 * 3 )]      //中缀表达式
[1 2 1 * + 12 - 2 + 1 2 3 * + -]            //后缀表达式
-14                                        //结果

实现流程

caculate.png

后缀到前缀的转换

func pre2stuf(exps []string) (exps2 []string) {
    list1 := list.New()
    list2 := list.New()

    for _, exp := range exps {
        if isOperate(exp) {
            if op, ok := isPop(list1, exp); ok {
                for _, s := range op {
                    list2.PushBack(s)
                }
            }
            if exp == ")" {
                continue
            }
            list1.PushBack(exp)
        } else {
            list2.PushBack(exp)
        }
    }

    for cur := list1.Back(); cur != nil; cur = cur.Prev() {
        list2.PushBack(cur.Value)
    }

    for cur := list2.Front(); cur != nil; cur = cur.Next() {
        if curValue, ok := cur.Value.(string); ok {
            exps2 = append(exps2, curValue)
        }
    }
    return
}

代码在这里

程序功能参数

Usage of ./gscan:
  -h="help": help doc
  -ip="127.0.0.1": IP range of scan. Example:
        192.168.1.1
        192.168.1.1, 192.168.1.5
        192.168.1.1-192.168.1.100
  -p="1-1024": Port range of scan. Example:
        135
        135, 445, 3389
        1-1024
  -w="connect": Way of scan. Expample
        connect
        syn
        fin

实现思想

一个端口一个goroutine,用一个chanel接收所有goroutine返回的扫描结果

代码在这里