day19-Common module IV (re, typing)

re module

Used to find specific content in a string (text)

1. Meta characters: characters with special meaning

  • ^ matches at the beginning of the string
import re
# ^ anchors the pattern to the start of the string; findall returns the matches as a list
a = re.findall('^abc', 'abcsds')
b = re.findall('^abc', 'aabcsds')  # does not start with abc, so an empty list is returned
print(a,b)
['abc'] []
  • $ matches at the end of the string
# $ anchors the pattern to the end of the string
a = re.findall('abc$', 'sdfabcsdsabc')
b = re.findall('abc$', 'aabcsdsbc')  # does not end with abc, so an empty list is returned
print(a,b)
['abc'] []
  • | is equivalent to or
# Either alternative may match at each position: a single a, or the pair bc
a = re.findall('a|bc', 'sdfabbcsdsabc')  # the matches are returned as a list
print(a)
['a', 'bc', 'a', 'bc']
  • [] matches any single character listed inside the brackets
# Every character that is b, a, or c is returned as its own match
a = re.findall('[bac]', 'sdfabcsdsabc')
print(a)
['a', 'b', 'c', 'a', 'b', 'c']
  • [^] negates the set: it matches any single character that is NOT listed inside the brackets. Writing ^ as the first character of a character set inverts it
# Every character other than b, a, or c is returned as its own match
a = re.findall('[^bac]', 'sdfabcsdsabc')
print(a)
['s', 'd', 'f', 's', 'd', 's']
  • () defines a group: the whole pattern must match, but findall returns only the text captured inside the ()
# The full pattern abcs must match, but only the group (bc) is returned
a = re.findall('a(bc)s', 'sdfabcsdsabc')
print(a)
['bc']
  • . matches any single character (except a newline by default)
# b followed by exactly one more character; the final b has nothing after it, so it does not match
a = re.findall('b.', 'sdb,sdb sdkjfbasd sdb')  # . can stand for any character, including spaces and other characters
print(a)
['b,', 'b ', 'ba']
  • {n} matches the character immediately before the brace exactly n times
# a followed by exactly three b; {3} applies only to the b
a = re.findall('ab{3}','abbbbsfsabbs dfbbb')
print(a)
['abbb']
  • * matches the preceding character zero or more times
# s followed by zero or more a, so a bare s also matches
a = re.findall('sa*','fsa dsaasdf')
print(a)
['sa', 'saa', 's']
  • + matches the preceding character one or more times
# Each run of consecutive a characters is returned as one match
a = re.findall('a+','fsa dsaasdf')  # must match at least one a
print(a)
['a', 'aa']
  • ? matches the preceding character zero or one time
# s followed by at most one a; in "saa" only the first a is consumed
a = re.findall('sa?','fsa dsaasdf')  # matches zero or one a
print(a)
['sa', 'sa', 's']

2. Predefined characters: backslash followed by ordinary characters to achieve special functions

  • \d matches digits (0-9)
# Raw string: the backslash reaches the regex engine untouched instead of
# being treated as an (invalid) string escape.
a = re.findall(r'\d', 'sda123jf 342 4sdf4')
print(a)
['1', '2', '3', '3', '4', '2', '4', '4']
  • \D matches non-digit characters
# Raw string so that \D is passed to the regex engine as written.
a = re.findall(r'\D', 'sda123jf 342 4sdf4')
print(a)
['s', 'd', 'a', 'j', 'f', ' ', ' ', 's', 'd', 'f']
  • \s matches whitespace characters (space, tab, newline, ...)
# Raw string so that \s is passed to the regex engine as written;
# the two spaces in the text are the only whitespace matches.
a = re.findall(r'\s', 'sda123jf 342 4sd,f4')
print(a)
[' ', ' ']
  • \S matches non-whitespace characters
# Raw string so that \S is passed to the regex engine as written.
a = re.findall(r'\S', 'sda123jf 342 4sd,f4')
print(a)
['s', 'd', 'a', '1', '2', '3', 'j', 'f', '3', '4', '2', '4', 's', 'd', ',', 'f', '4']
  • \w matches letters, numbers, underscores
# Raw string so that \w is passed to the regex engine as written;
# the space, ? and , are skipped because they are not word characters.
a = re.findall(r'\w', 'sd_f 34?2 4sd,f4')
print(a)
['s', 'd', '_', 'f', '3', '4', '2', '4', 's', 'd', 'f', '4']
  • \W matches any character that is not a letter, digit, or underscore
# Raw string so that \W is passed to the regex engine as written.
a = re.findall(r'\W', 'sd_f 34?2 4sd,f4')
print(a)
[' ', '?', ' ', ',']

3. Greedy matching: a quantifier keeps consuming characters for as long as the pattern can still match

# .* is greedy: starting at the first a it consumes the rest of the string
a = re.findall('a.*', 'asda123456asa')
print(a)
['asda123456asa']

4. Non-greedy matching: adding ? after a quantifier makes it match as little as possible, so matching stops as soon as the pattern is satisfied

# The trailing ? makes .* lazy, so each match stops right after the a
a = re.findall('a.*?', 'asda123456asa')
print(a)
['a', 'a', 'a', 'a']

5. Commonly used functions

  • re.compile compiles a pattern into a reusable pattern object, which is like writing a general rule template once
# Compile each pattern once so it can be reused like a rule template.
# Raw strings pass the backslashes to the regex engine untouched.
phone_compile = re.compile(r'1\d{10}')

# The dot is escaped so it matches a literal '.' instead of any character.
email_compile = re.compile(r'\w+@\w+\.\w+')

test_s = '12345678900  user@example.com  admin@example.org'
res = phone_compile.findall(test_s)
print(res)

res = email_compile.findall(test_s)
print(res)
['12345678900']
['user@example.com', 'admin@example.org']
  • re.match matches only at the beginning of the string and returns a match object (None if the beginning does not match)
# Raw strings so that \d is passed to the regex engine as written.
# match only looks at the start of the string: the first text starts with a
# letter, the second with a digit.
a = re.match(r'\d','sdf123sdd456')
b = re.match(r'\d','123sdfa 212d')
print(a)
print(b)
None
<_sre.SRE_Match object; span=(0, 1), match='1'>
  • re.search scans the whole string and returns a match object for the first match; its span gives the index range of that match
# Raw string so that \d is passed to the regex engine as written.
# search scans forward and stops at the first digit it finds.
a = re.search(r'\d','sdfs1213hfjsf 2323')
print(a)
<_sre.SRE_Match object; span=(4, 5), match='1'>

The difference between match and search: match only tries to match at the beginning of the string, while search scans the whole string and returns the first match it finds

  • re.split splits the string wherever the pattern matches and returns the pieces as a list
s = 'asb sfsl sfjwo212 12312,dsfsf'
# str.split can only split on a fixed separator
print(s.split(' '))

# re.split splits on every run of digits; a raw string keeps \d intact
res = re.split(r'\d+',s)
print(res)
['asb', 'sfsl', 'sfjwo212', '12312,dsfsf']
['asb sfsl sfjwo', ' ', ',dsfsf']
  • re.sub and re.subn both replace matched content, similar to the built-in str.replace method; subn additionally returns how many replacements were made
import re

s = 'asfhf12fdgds 743wiuw22'

# Raw strings so that \d is passed to the regex engine as written
print(re.sub(r'\d',',',s))

print(re.subn(r'\d',',',s))  # besides replacing, subn also returns how many replacements were made
asfhf,,fdgds ,,,wiuw,,
('asfhf,,fdgds ,,,wiuw,,', 7)

typing module

1. Type hints let tools check that argument and return value types are consistent, helping to prevent type mismatches that would otherwise only show up at runtime.

2. They serve as additional documentation for developers: callers can see which argument types to pass in and which type is returned.

3. Adding type hints does not affect how the program runs: a type mismatch does not raise an error, it only produces a reminder in tools that check the hints.

  • Note: The typing module is only available in Python 3.5 and above; PyCharm currently supports checking typing hints
from typing import List, Tuple, Dict
def add(a: int, string: str, f: float,
        b: bool) -> Tuple[List[int], Tuple[str, str, str], Dict[str, float], bool]:
    """Build one sample value of each annotated type from the arguments.

    Args:
        a: Length of the returned list, which holds ``0 .. a - 1``.
        string: Value repeated three times in the returned tuple.
        f: Value stored under the key ``"a"`` in the returned dict.
        b: Flag that is returned unchanged.

    Returns:
        The 4-tuple ``(list(range(a)), (string, string, string), {"a": f}, b)``.
    """
    return list(range(a)), (string, string, string), {"a": f}, b


print(add(5, "hhhh", 2.3, False))

Crawl audio

import re
import requests

# Without a timeout a stalled server would hang the script forever.
REQUEST_TIMEOUT = 10  # seconds

# Download the index page and pull out the link whose path ends in _yp.htm
# (presumably the audio listing page -- confirm against the live site).
response = requests.get('http://www.gov.cn/premier/index.htm', timeout=REQUEST_TIMEOUT)
response.raise_for_status()
data = response.text

# findall returns only the text captured inside the parentheses
listing_paths = re.findall(r'href="(/\w+/\w+_yp\.htm)"', data)
if not listing_paths:
    raise SystemExit('no *_yp.htm link found on the index page')
yp_res = 'http://www.gov.cn' + listing_paths[0]

yp_response = requests.get(yp_res, timeout=REQUEST_TIMEOUT)
yp_response.raise_for_status()
yp_data = yp_response.text

count = 0
for url in re.findall(r'<a href="(.*?)"', yp_data):
    if url == 'javascript:;':
        continue
    page_url = 'http://www.gov.cn' + url

    page_response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    page_response.encoding = 'utf8'  # decode the page text as UTF-8
    page_html = page_response.text

    # Look the two fields up separately so a page lacking either one is
    # skipped instead of crashing with an IndexError.
    title_match = re.search(r'<title>(.*?)</title>', page_html)
    src_match = re.search(r'data-src="(.*?)"', page_html)
    if title_match is None or src_match is None:
        continue
    title = title_match.group(1)
    mp3_url = src_match.group(1)
    if mp3_url.startswith('/home'):
        continue

    mp3_response = requests.get(mp3_url, timeout=REQUEST_TIMEOUT)
    if not mp3_response.ok:
        # Do not save an error page under an .mp3 name
        continue
    mp3_data = mp3_response.content  # the MP3 in binary form

    # Replace characters that are not allowed in file names so open() cannot
    # fail or write outside the current directory.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', title).strip()
    with open(f'{safe_title}.mp3', 'wb') as fw:
        fw.write(mp3_data)  # leaving the with block flushes and closes the file
    count += 1
    print(f'{count}')