數據結構和算法
通過*表達式或者是一個用不到的變量來拆分,處理序列
>>> data = ['a', 'v', 'x']
>>> a, _, x = data
>>> a
'a'
>>> _
'v'
>>> x
'x'
>>>
def drop_first_last(grads):
    """Return the items of ``grads`` without the first and the last one.

    The starred target always collects its items into a list, whatever
    the type of the input. Unpacking raises ValueError when ``grads``
    holds fewer than two items.
    """
    _head, *inner, _tail = grads
    return inner
def split_star():
    """Split a passwd-style record on ':' and print selected fields.

    Prints the user name, the home directory and the shell, followed by
    the list of fields the starred target collected between them.
    """
    line = 'nobody:*:-2:-2:Unprivileged User:/var/empty:/usr/bin/false'
    parts = line.split(':')
    user, *between, home, shell = parts
    for value in (user, home, shell, between):
        print(value)
def sum(items):
    """Recursively add up ``items`` using star-unpacking.

    The sequence is split into its first element and the remaining
    elements; the remainder is summed by a recursive call.

    NOTE: the name shadows the builtin ``sum``; it is kept so existing
    callers keep working.

    Args:
        items: A sequence of values that support ``+``.

    Returns:
        The sum of all elements, or 0 when ``items`` is empty.
    """
    # An empty sequence cannot be unpacked into ``head, *tail``.
    if not items:
        return 0
    head, *tail = items
    if not tail:
        return head
    return head + sum(tail)
使用 collections 的 deque 可以構造一個隊列
- 設置隊列的長度,append滿后可以自動移除最先入隊的元素
>>> from collections import deque
>>> q = deque(maxlen=5)
>>> q.append(1)
>>> q.append(2)
>>> q.append(3)
>>> q.append(4)
>>> q
deque([1, 2, 3, 4], maxlen=5)
>>> q.append(5)
>>> q
deque([1, 2, 3, 4, 5], maxlen=5)
>>> q.append(6)
>>> q
deque([2, 3, 4, 5, 6], maxlen=5)
>>> q.popleft()
2
>>> q.appendleft(1)
>>> q
deque([1, 3, 4, 5, 6], maxlen=5)
使用heapq.nlargest 和 heapq.nsmallest 靈活找出集合中最大或者最小的N個元素
import heapq


def test_heapq():
    """Print the three largest and three smallest items of sample data.

    First for a plain list of numbers, then for a list of dicts ranked
    by their ``'num'`` value through the ``key`` argument.
    """
    nums = [1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
    portfolio = [
        {'name': 'a', 'num': 1},
        {'name': 'b', 'num': 2},
        {'name': 'c', 'num': 3},
        {'name': 'd', 'num': 4},
        {'name': 'e', 'num': 5},
    ]
    print(heapq.nlargest(3, nums))
    print(heapq.nsmallest(3, nums))
    # A key function (here a lambda) lets heapq rank more complex records.
    print(heapq.nsmallest(3, portfolio, key=lambda s: s['num']))
    print(heapq.nlargest(3, portfolio, key=lambda s: s['num']))
- 如果只是找出最大或最小,N=1,那么使用max() min()是最適合的
如果N相對于總數來說,N很小,使用堆是最適合的,每次Pop都能pop出最小的元素
>>> import heapq
>>> num = [1, 2, 3, 4, 8, 5, 6]
>>> heapq.heapify(num)
>>> heapq.heappop(num)
1
>>> heapq.heappop(num)
2
>>> heapq.heappop(num)
3
使用defaultdict 可以自動初始化一對多字典的初始值
>>> from collections import defaultdict
>>> d = defaultdict(list)
>>> d['a'].append(1)
>>> d['a'].append(2)
>>> d['b'].append(3)
>>> d
defaultdict(<class 'list'>, {'b': [3], 'a': [1, 2]})
>>> d['a']
[1, 2]
>>> d = defaultdict(set)
>>> d['a'].add(1)
>>> d['a'].add(1)
>>> d['a'].add(2)
>>> d['b'].add(3)
>>> d
defaultdict(<class 'set'>, {'b': {3}, 'a': {1, 2}})
>>> d['a']
{1, 2}
>>>
控制字典迭代的順序
>>> from collections import OrderedDict
>>> d = OrderedDict()
>>> d['a'] = 1
>>> d['b'] = 2
>>> d['c'] = 3
>>> for key in d:
... print (key, d[key])
a 1
b 2
c 3
- 在構建json時也能控制順序
- OrderedDict內部維護了一個雙向鏈表,數據量大的時候需要考慮內存開銷
>>> import json
>>> json.dumps(d)
'{"a": 1, "b": 2, "c": 3}'
字典的排序,極值
- 使用zip對反轉鍵值對,使用max或者min找出最大最小值。zip出來的迭代器只能被消費一次,不能重復使用
- min max 對元組進行比較,會先對values進行比較,而后才是key
>>> price = {'ACME':45.23, 'AAPL':712.78, 'FB':11}
>>> price_zip = zip(price.values(), price.keys())
>>> min(price_zip)
(11, 'FB')
>>> max(price_zip)
Traceback (most recent call last):
File "<input>", line 1, in <module>
max(price_zip)
ValueError: max() arg is an empty sequence
- 不使用zip,在min max 時候指定key.內部排序在遍歷dict時使用price[k]即value進行排序返回key
>>> price = {'ACME':45.23, 'AAPL':712.78, 'FB':11, 'IBM':205.55}
>>> min(price, key=lambda k:price[k])
'FB'
在兩個字典中尋找相同點
>>> a = {'x':1, 'y':2, 'z':3}
>>> b = {'w':10, 'x':11, 'y':2}
# Find keys in common
>>> a.keys() & b.keys()
{'y', 'x'}
# Find keys in a that are not in b
>>> a.keys() - b.keys()
{'z'}
# Find key,value pairs in a that not in b
>>> a.items() - b.items()
{('x', 1), ('z', 3)}
刪除序列相同的元素并保持元素順序
- yield 返回了一個生成器
- 不考慮保持元素順序,可以使用set去除相同的元素
def dedupe(items, key=None):
    """Yield the items of ``items`` in order, skipping repeats.

    Args:
        items: Any iterable.
        key: Optional function mapping an item to a hashable value used
            for the "already seen" check. Needed when the items
            themselves are unhashable (for example dicts). When omitted
            the item itself is used, so items must be hashable.

    Yields:
        Each item whose (keyed) value has not been yielded before.
    """
    seen = set()
    for item in items:
        marker = item if key is None else key(item)
        if marker not in seen:
            yield item
            seen.add(marker)
if __name__ == '__main__':
    a = [1, 5, 2, 1, 9, 1, 5, 10]
    # Printing the call result shows the generator object itself,
    # not the deduplicated values.
    lazy_result = dedupe(a)
    print(lazy_result)
    # Iterating a generator is what actually produces the values.
    for value in dedupe(a):
        print(value)
命名切片slice
>>> a = slice(0, 10, 2)
>>> c = 'dsfdffdfdfdfd'
>>> c[a]
'dffdd'
>>> a = slice(0, 10)
>>> c[a]
'dsfdffdfdf'
- 調用indices可以返回一個切片序列的元組,用于迭代切片
- 使用*將元組放入range
>>> s = 'HelloWorld'
>>> for i in range(*slice(5, 10, 2).indices(10)):
... print (i)
... print (s[i])
5
W
7
r
9
d
統計序列中出現次數最多的元素
>>> words = [
... 'look', 'into', 'my', 'eyes', 'look', 'into', 'my', 'eyes',
... 'the', 'eyes', 'the', 'eyes', 'the', 'eyes', 'not', 'around', 'the',
... 'eyes', "don't", 'look', 'around', 'the', 'eyes', 'look', 'into',
... 'my', 'eyes', "you're", 'under'
... ]
>>> from collections import Counter
>>> word_counts = Counter(words)
>>> top_three = word_counts.most_common(3)
>>> print (top_three)
[('eyes', 8), ('the', 5), ('look', 4)]
>>> top_three = word_counts.most_common(1)
>>> print (top_three)
[('eyes', 8)]
>>> word_counts['not']
1
>>> word_counts['eyes']
8
- Counter也可以進行數學運算,在制表和計數數據的場合很有用
>>> a = Counter(words)
>>> a
Counter({'eyes': 8, 'the': 5, 'look': 4, 'into': 3, 'my': 3, 'around': 2, 'not': 1, "don't": 1, "you're": 1, 'under': 1})
>>> morewords = ['eyes', 'eyes', 'look']
>>> b = Counter(morewords)
>>> c = a + b
>>> c
Counter({'eyes': 10, 'look': 5, 'the': 5, 'into': 3, 'my': 3, 'around': 2, 'not': 1, "don't": 1, "you're": 1, 'under': 1})
>>> d = a - b
>>> d
Counter({'eyes': 6, 'the': 5, 'look': 3, 'into': 3, 'my': 3, 'around': 2, 'not': 1, "don't": 1, "you're": 1, 'under': 1})
通過某個關鍵字排序字典序列
>>> rows = [
... {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003},
... {'fname': 'David', 'lname': 'Beazley', 'uid': 1002},
... {'fname': 'John', 'lname': 'Cleese', 'uid': 1001},
... {'fname': 'Big', 'lname': 'Jones', 'uid': 1004}
... ]
>>> from operator import itemgetter
>>> itemgetter('fname')
operator.itemgetter('fname')
>>> print (sorted(rows, key=itemgetter('fname')))
[{'fname': 'Big', 'lname': 'Jones', 'uid': 1004}, {'fname': 'Brian', 'lname':'Jones', 'uid': 1003}, {'fname': 'David', 'lname': 'Beazley', 'uid': 1002}, {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}]
>>> print (sorted(rows, key=itemgetter('lname', 'fname')))
[{'fname': 'David', 'lname': 'Beazley', 'uid': 1002}, {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}, {'fname': 'Big', 'lname': 'Jones', 'uid': 1004}, {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003}]
排序不支持原生比較的class
- lambda 和 attrgetter 都可以使用
- 如果要對比多個字段,需要使用attrgetter
from operator import attrgetter
class User:
    """Minimal record type that defines no ordering of its own."""

    def __init__(self, user_id):
        # The only attribute; used as the sort key in the examples.
        self.user_id = user_id

    def __repr__(self):
        template = 'User({})'
        return template.format(self.user_id)
def sort_notcompare():
    """Print three users unsorted, then sorted by ``user_id`` two ways.

    The first sort uses a lambda as the key function, the second uses
    ``operator.attrgetter``; both print the same ordering.
    """
    users = [User(uid) for uid in (23, 3, 99)]
    print(users)
    by_lambda = sorted(users, key=lambda u: u.user_id)
    print(by_lambda)
    by_getter = sorted(users, key=attrgetter('user_id'))
    print(by_getter)
根據dict字段將記錄分組,可以分組迭代
- 先根據date字段對rows排序
- 排序后groupby分組,會返回分組的字段值和分組內容的迭代器
from operator import itemgetter
from itertools import groupby
def group_by(rows):
    """Print ``rows`` grouped by their ``'date'`` value.

    ``rows`` is sorted in place by ``'date'`` first, because groupby()
    only groups consecutive items that share a key. For every group the
    date is printed, followed by each record of that group.
    """
    by_date = itemgetter('date')
    rows.sort(key=by_date)
    for day, members in groupby(rows, key=by_date):
        print(day)
        # ``members`` is an iterator over the records of this group.
        for record in members:
            print(record)
if __name__ == '__main__':
    # (address, date) pairs, expanded into the dict records group_by() reads.
    entries = (
        ('5412 N CLARK', '07/01/2012'),
        ('5148 N CLARK', '07/04/2012'),
        ('5800 E 58TH', '07/02/2012'),
        ('2122 N CLARK', '07/03/2012'),
        ('5645 N RAVENSWOOD', '07/02/2012'),
        ('1060 W ADDISON', '07/02/2012'),
        ('4801 N BROADWAY', '07/01/2012'),
        ('1039 W GRANVILLE', '07/04/2012'),
    )
    rows = [{'address': address, 'date': date} for address, date in entries]
    group_by(rows)
篩選序列元素
>>> mylist = [1, 4, -5, 10, -7, 2, 3, -1]
>>> [n for n in mylist if n > 0 ]
[1, 4, 10, 2, 3]
>>>
>>> mylist = [1, 4, -5, 10, -7, 2, 3, -1]
>>> chip_neg = [n if n > 0 else 0 for n in mylist]
>>> chip_neg
[1, 4, 0, 10, 0, 2, 3, 0]
>>> mylist = [1, 4, -5, 10, -7, 2, 3, -1]
>>> pos = (n for n in mylist if n > 0 )
>>> pos
<generator object <genexpr> at 0x10e5eab48>
>>> for i in pos:
... print(i)
1
4
10
2
3
- 如果篩選過程復雜,涉及異常處理,可以將篩選過程放在函數中,通過 filter 處理,此函數會將列表里面的內容依次作用于函數,根據True和False來決定是否保留
- filter 返回一個迭代器
def is_int(val):
    """Return True when ``int(val)`` succeeds, False otherwise.

    Intended as a predicate for ``filter()``.

    Args:
        val: The value to test, typically a string.

    Returns:
        True if ``int(val)`` does not raise; False when it raises
        ValueError (malformed text) or TypeError (a value such as None
        that cannot be converted at all).
    """
    try:
        int(val)
    except (ValueError, TypeError):
        return False
    else:
        return True
if __name__ == '__main__':
    values = ['1', '2', '-3', '-', 'N/A', '5']
    # filter() is lazy; list() drains it into a concrete list.
    kept = filter(is_int, values)
    ivals = list(kept)
    print(ivals)
- itertools.compress() 篩選
- compress 返回一個迭代器,需要傳入篩選列表的布爾表達式,此函數會篩選出True的值,常用于把一個序列的值施加到另一個序列上
>>> addresses = [
... '5412 N CLARK',
... '5148 N CLARK',
... '5800 E 58TH',
... '2122 N CLARK',
... '5645 N RAVENSWOOD',
... '1060 W ADDISON',
... '4801 N BROADWAY',
... '1039 W GRANVILLE',
... ]
>>> counts = [ 0, 3, 10, 4, 1, 7, 6, 1]
>>> from itertools import compress
>>> more5 = [n>5 for n in counts]
>>> more5
[False, False, True, False, False, True, True, False]
>>> list(compress(addresses, more5))
['5800 E 58TH', '1060 W ADDISON', '4801 N BROADWAY']
字典中提取子集
>>> prices = {
... 'ACME': 45.23,
... 'AAPL': 612.78,
... 'IBM': 205.55,
... 'HPQ': 37.20,
... 'FB': 10.75
... }
>>> p1 = {k:v for k, v in prices.items() if v > 200 }
>>> p1
{'AAPL': 612.78, 'IBM': 205.55}
>>> tech_names = {'AAPL', 'IBM', 'HPQ', 'MSFT'}
>>> p2 = {k:v for k, v in prices.items() if k in tech_names }
>>> p2
{'HPQ': 37.2, 'AAPL': 612.78, 'IBM': 205.55}
>>> p1 = dict((k, v)for k, v in prices.items() if v > 200)
>>> p1
{'AAPL': 612.78, 'IBM': 205.55}
映射名稱到序列的元素
- 通過為通過下標訪問的序列構造名字,通過名字來訪問該元素,使用命名元組提高代碼的可讀性
>>> from collections import namedtuple
>>> sub = namedtuple('sub', ['name', 'age'])
>>> c = sub('bob', '11')
>>> c.name
'bob'
>>> c.age
'11'
- 可以使用命名元組替代字典,命名元組不能直接賦值,可以使用_replace方法替換并重新生成一個命名元組
>>> c._replace(age='12')
sub(name='bob', age='12')
- _replace() 方法還有一個很有用的特性就是當你的命名元組擁有可選或者缺失字段時候,它是一個非常方便的填充數據的方法。
>>> info = namedtuple('info', ['name','age', 'other'])
>>> info_1 = info(None, None, None)
>>> def replace(s):
... return info_1._replace(**s)
>>>
>>> replace({'name':'bb', 'age':12, 'other':'111'})
info(name='bb', age=12, other='111')
轉換并同時計算數據
>>> nums = [1, 2, 3, 4, 5]
>>> s = sum((x * x for x in nums)) #平方和
>>> s
55
import os
files = os.listdir('.')
# any() is true as soon as a single element is true.
has_python = any(name.endswith('.py') for name in files)
print('There be python' if has_python else 'Sorry, no python')

s = ('ACME', 50, 123.45)
# Convert every field to text before joining.
print(','.join(map(str, s)))

test = [
    {'name': 'GOOG', 'shares': 11},
    {'name': 'GOOG', 'shares': 75},
    {'name': 'GOOG', 'shares': 11},
]
# A generator expression passed as the sole argument needs no extra
# pair of parentheses; both calls below are equivalent.
print(min(s['shares'] for s in test))
print(min((s['shares'] for s in test)))
# With a key function min() returns the whole record instead of the value.
print(min(test, key=lambda s: s['shares']))
多個映射合并為單個映射
- 檢查a,b 字典,a如果沒有去b中找
- ChainMap 會重新建立映射,重新定義常見的字典操作來進行操作
- 如果有重復的映射,只會使用第一個出現的映射
- 修改映射的值只會作用于第一個映射結構
>>> a = {'x':1, 'z':3}
>>> b = {'y':2, 'z':4}
>>> from collections import ChainMap
>>> c = ChainMap(a, b)
>>> c
ChainMap({'x': 1, 'z': 3}, {'y': 2, 'z': 4})
>>> print (c['x'])
1
>>> print (c['y'])
2
>>> print (c['z'])
3
>>> list(c.keys())
['x', 'z', 'y']
>>> list(c.values())
[1, 3, 2]
>>> c['z'] = 10
>>> c
ChainMap({'x': 1, 'z': 10}, {'y': 2, 'z': 4})
- 可以使用dict 的update方法單獨構造一個新字典,但如果存在相同的key新生成的字典只會有一個key,而且對原始數據進行修改的話,不會反映到新生成的字典上,使用ChainMap就可以實現
>>> a = {'x':1, 'z':3}
>>> b = {'y':2, 'z':4}
>>> maerged = ChainMap(a, b)
>>> maerged
ChainMap({'x': 1, 'z': 3}, {'y': 2, 'z': 4})
>>> maerged['x']
1
>>> a['x'] = 999
>>> maerged['x']
999