天猫闲谈

遇到的问题和解决方案

SSL报错01

1
2
3
  File "C:\ProgramData\Anaconda3\lib\site-packages\requests\adapters.py", line 512, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:833)

当时使用的是 Anaconda;貌似在 Win10 的 Anaconda Python 环境中,SSL 无法正常使用。

SSL报错02

1
2
    raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='detail.tmall.com', port=443): Max retries exceeded with url: /item.htm?spm=a220m.1000858.1000725.6.279262bdaFcN4o&id=644251278102&skuId=4811506167531&areaId=430500&user_id=3695246029&cat_id=2&is_b=1&rn=fbac5ad264b425e565a06fe73f8dddc5 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))

把代理和 VPN 都关掉就行了

遇到\u0026开头的数据

使用在线网站解码 Unicode

网站链接:Unicode编码解码(bt.cn)

image-20220719210902845

代码解码:

1
2
3
def decode_unicode_escapes(text):
    """Turn literal ``\\uXXXX`` escape sequences in *text* into real characters.

    ``raw_unicode_escape`` leaves existing backslashes untouched and writes
    any non-Latin-1 character as a ``\\uXXXX`` escape, so text that already
    contains real non-ASCII characters (e.g. Chinese) survives the round
    trip instead of turning into mojibake, which is what happens with
    ``encode('utf8').decode('unicode_escape')``.

    Raises:
        UnicodeDecodeError: if *text* contains a malformed escape sequence
            (for example a trailing lone backslash).
    """
    return text.encode('raw_unicode_escape').decode('unicode_escape')


# The literal must be raw: in a normal string Python resolves \u003d / \u0026
# while parsing the source, so there would be nothing left to decode.
str1 = r'\u003d676464506778\u0026ns\u003d1\u0026abbucket\u003d20'
str1 = decode_unicode_escapes(str1)
print(str1)

JSON形式数据无法格式化

网站推荐:SO JSON 在线工具(JSON在线 | JSON解析格式化)

主页数据获取

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import random
import requests
import re
from lxml import etree
# Request headers; filled in further down once a User-Agent has been chosen.
headers = {}

# Pool of browser User-Agent strings to pick from at random.
# Every entry is ONE complete User-Agent written as two adjacent string
# literals. There is deliberately no comma between the two halves: with a
# comma each half becomes its own list element and random.choice() can
# return a bare "(KHTML, like Gecko) ..." fragment instead of a valid UA.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
# Approach: cut the relevant substring out with a regular expression, convert
# it to JSON, unicode-decode the extracted detail_url, then request the
# detail (second-level) page and save the result to the database.
import os

headers['User-Agent'] = random.choice(user_agent_list)

# The logged-in session cookie is read from the TMALL_COOKIE environment
# variable instead of being pasted into the source: a session cookie is a
# credential, it expires, and the pasted literal was broken across two lines
# (a SyntaxError). Copy the "cookie" request header from a logged-in browser
# session and export it before running the script.
_cookie = os.environ.get('TMALL_COOKIE')
if _cookie:
    headers['cookie'] = _cookie


# headers = {
# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'accept-language': 'zh-CN,zh;q=0.9',
# 'cache-control': 'no-cache',
# }
# Fetch one search-result page and print price / sales / shop / review count
# for every product tile found on it.
# NOTE(review): `n` is never used in the URL (the offset is fixed at s=60) and
# the loop body ends with an unconditional `break`, so exactly one page is
# requested per run.
for n in range(1, 30):
    url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.752962bd7xoYBT&s=60&q=%C5%AE%CA%BF%B0%FC&sort=s&style=g&from=mallfp..pc_1_searchbutton&type=pc#J_Filter'
    # Without a timeout requests waits indefinitely on a stalled connection.
    resp = requests.get(url=url, headers=headers, timeout=10).text
    # js_format = re.findall('"itemlist":(.*?),"bottomsearch":',resp)
    # print(js_format)
    html = etree.HTML(resp)
    div_list = html.xpath('//div[@class="page"]/div/div[@id="content"]/div/div[@id="J_ItemList"]/div')
    print(len(div_list))
    for item in div_list:
        # Each xpath call returns a (possibly empty) list of text nodes.
        price = item.xpath('./div/p[@class="productPrice"]/em/text()')
        sell_num = item.xpath('./div/p[@class="productStatus"]/span/em/text()')
        shop_name = item.xpath('./div/div[@class="productShop"]/a/text()')
        count_num = item.xpath('./div/p[@class="productStatus"]/span/a/text()')
        print(price, sell_num, shop_name, count_num)
    # print(resp)
    print("end!!!")
    break

运行截图:

image-20220719213614328

二级页面

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from lxml import etree
import random
import requests
import re
# Request headers; filled in further down once a User-Agent has been chosen.
headers = {}

# Pool of browser User-Agent strings to pick from at random.
# Every entry is ONE complete User-Agent written as two adjacent string
# literals. There is deliberately no comma between the two halves: with a
# comma each half becomes its own list element and random.choice() can
# return a bare "(KHTML, like Gecko) ..." fragment instead of a valid UA.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
# Approach: cut the relevant substring out with a regular expression, convert
# it to JSON, unicode-decode the extracted detail_url, then request the
# detail (second-level) page and save the result to the database.
import os

headers['User-Agent'] = random.choice(user_agent_list)

# The logged-in session cookie is read from the TMALL_COOKIE environment
# variable instead of being pasted into the source: a session cookie is a
# credential and it expires. Copy the "cookie" request header from a
# logged-in browser session and export it before running the script.
_cookie = os.environ.get('TMALL_COOKIE')
if _cookie:
    headers['cookie'] = _cookie

# Download one product detail page and pull the attribute values out of the
# "<li title=...>label:&nbsp;value</li>" entries in its HTML.
url = 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.6.279262bdaFcN4o&id=644251278102&skuId=4811506167531&areaId=430500&user_id=3695246029&cat_id=2&is_b=1&rn=fbac5ad264b425e565a06fe73f8dddc5'
# Without a timeout requests waits indefinitely on a stalled connection.
resp = requests.get(url=url, headers=headers, timeout=10).text
# print(resp)


def extract_attr(pattern, html):
    """Return all captures of *pattern* in *html* joined together.

    re.findall returns an empty list when nothing matches, so the result is
    simply '' for a missing attribute; no exception handling is needed.
    """
    return ''.join(re.findall(pattern, html))


place = extract_attr(r'<li title=.*?>质地:&nbsp;(.*?)</li>', resp)
popular = extract_attr(r'<li title=.*?>流行元素:&nbsp;(.*?)</li>', resp)
# The 颜色分类 (colour) attribute was commented out in the original and is
# not extracted.
up_time = extract_attr(r'<li title=.*?>上市时间:&nbsp;(.*?)</li>', resp)
style = extract_attr(r"<li title=.*?>款式:&nbsp;(.*?)</li>", resp)
size = extract_attr(r"<li title=.*?>大小:&nbsp;(.*?)</li>", resp)
popular_name = extract_attr(r"<li title=.*?>流行款式名称:&nbsp;(.*?)</li>", resp)
sell_way = extract_attr(r"<li title=.*?>销售渠道类型:&nbsp;(.*?)</li>", resp)
# The brand entry is matched from its title attribute rather than from "<li".
brand = extract_attr(r'title="&nbsp;.*?>品牌:&nbsp;(.*?)</li>', resp)
print(place, popular, style, up_time, size, popular_name, sell_way, brand)

运行截图:

image-20220719213736527

网页开发者工具源码展示:

image-20220719213948250

image-20220719214014226

源代码

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import random
import requests
import re,time
from lxml import etree
import mysql.connector
# Connect to the database.
import os

# The password is taken from the MYSQL_PASSWORD environment variable so that
# no credential is stored in the source file.
mydb = mysql.connector.connect(
    host='localhost',
    user='root',
    password=os.environ.get('MYSQL_PASSWORD', ''),
    database='love',
    auth_plugin='mysql_native_password',
)
mycursor = mydb.cursor()
# Parameterised INSERT: twelve columns, twelve %s placeholders.
sql = 'INSERT INTO tm_bag (price,sell_num,shop_name,count_num,place, popular, style, up_time, size, popular_name, sell_way, brand) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
# Pool of browser User-Agent strings to pick from at random.
# Every entry is ONE complete User-Agent written as two adjacent string
# literals. There is deliberately no comma between the two halves: with a
# comma each half becomes its own list element and random.choice() can
# return a bare "(KHTML, like Gecko) ..." fragment instead of a valid UA.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
# Regular expressions for the attribute entries of a detail page, listed in
# the order in which get_detail() returns the values:
# place, popular, style, up_time, size, popular_name, sell_way, brand.
_DETAIL_PATTERNS = (
    r'<li title=.*?>质地:&nbsp;(.*?)</li>',
    r'<li title=.*?>流行元素:&nbsp;(.*?)</li>',
    r"<li title=.*?>款式:&nbsp;(.*?)</li>",
    r'<li title=.*?>上市时间:&nbsp;(.*?)</li>',
    r"<li title=.*?>大小:&nbsp;(.*?)</li>",
    r"<li title=.*?>流行款式名称:&nbsp;(.*?)</li>",
    # NOTE(review): the value returned as `sell_way` is matched on the label
    # 适用对象, not 销售渠道类型 as in the stand-alone detail script above --
    # confirm which attribute is intended.
    r"<li title=.*?>适用对象:&nbsp;(.*?)</li>",
    r'title="&nbsp;.*?>品牌:&nbsp;(.*?)</li>',
)


def parse_detail(html):
    """Extract the eight product attributes from detail-page HTML.

    Args:
        html: the page source as a string.

    Returns:
        An 8-tuple of strings ``(place, popular, style, up_time, size,
        popular_name, sell_way, brand)``. An attribute that is not present
        in *html* is returned as ``''``; if a pattern matches several times
        the captures are concatenated.
    """
    # re.findall yields [] when nothing matches, so ''.join() already gives
    # the empty-string default; no exception handling is required here.
    return tuple(''.join(re.findall(pattern, html)) for pattern in _DETAIL_PATTERNS)


def get_detail(url):
    """Download a product detail page and return its parsed attributes.

    Args:
        url: absolute URL of the detail page.

    Returns:
        The 8-tuple produced by parse_detail().

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not answer within the timeout.
    """
    import os

    headers = {'User-Agent': random.choice(user_agent_list)}
    # The session cookie comes from the TMALL_COOKIE environment variable
    # rather than a literal in the source: it is a credential and expires.
    cookie = os.environ.get('TMALL_COOKIE')
    if cookie:
        headers['cookie'] = cookie
    # Without a timeout requests waits indefinitely on a stalled connection.
    resp = requests.get(url=url, headers=headers, timeout=10).text
    return parse_detail(resp)
def main(n):
    """Scrape one search-result page and insert every product into MySQL.

    For each product tile on the page the listing fields (price, sales
    count, shop name, review count) are read with XPath, the product's
    detail page is fetched through get_detail(), and the twelve values are
    inserted with the module-level ``sql`` statement.

    Args:
        n: page index; the ``s`` offset in the search URL is ``n * 30``.
    """
    import os

    headers = {'User-Agent': random.choice(user_agent_list)}
    # The session cookie comes from the TMALL_COOKIE environment variable
    # rather than a literal in the source: it is a credential, it expires,
    # and the pasted literal was broken across two lines (a SyntaxError).
    cookie = os.environ.get('TMALL_COOKIE')
    if cookie:
        headers['cookie'] = cookie
    url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.752962bd7xoYBT&s={}&q=%C5%AE%CA%BF%B0%FC&sort=s&style=g&from=mallfp..pc_1_searchbutton&type=pc#J_Filter'.format(n*30)
    # print(url)
    time.sleep(0.5)
    # Without a timeout requests waits indefinitely on a stalled connection.
    resp = requests.get(url=url, headers=headers, timeout=10).text
    html = etree.HTML(resp)
    div_list = html.xpath('//div[@class="page"]/div/div[@id="content"]/div/div[@id="J_ItemList"]/div')
    print(len(div_list))

    def joined(node, path):
        # xpath() returns a list (empty when nothing matches), so the join
        # yields '' for a missing field.
        return ''.join(node.xpath(path))

    for v in div_list:
        print("进来了!!!")
        price = joined(v, './div/p[@class="productPrice"]/em/text()')
        sell_num = joined(v, './div/p[@class="productStatus"]/span/em/text()').replace('+', '').replace('笔', '')
        # Strip real newline characters; the raw string r'\n' used before is
        # a backslash followed by "n" and never matched them.
        shop_name = joined(v, './div/div[@class="productShop"]/a/text()').replace('\n', '')
        count_num = joined(v, './div/p[@class="productStatus"]/span/a/text()')
        href = r'https:' + joined(v, './div/div[@class="productImg-wrap"]/a/@href')
        print(href)
        # print(price,sell_num,shop_name,count_num,href)

        # Start from empty detail fields for every item so that a failed
        # detail request neither raises NameError on the first item nor
        # stores the previous item's values for this one.
        details = ('',) * 8
        try:
            details = get_detail(href)
            # Re-request the page once for each of brand, sell_way and
            # popular_name (tuple indexes 7, 6, 5) that is still empty.
            for field_index in (7, 6, 5):
                if details[field_index] == '':
                    details = get_detail(href)
        except Exception as e:
            print("详情页")
            print(e)
        print(*details)
        try:
            data = (price, sell_num, shop_name, count_num) + tuple(details)
            mycursor.execute(sql, data)
            mydb.commit()
        except Exception as e:
            print("添加错误")
            print(e)
        # Short pause between products.
        time.sleep(0.5)
    # print(resp)
    print("end!!!")
if __name__ == '__main__':
    # Crawl the requested page range and report the elapsed wall-clock time.
    t1 = time.time()
    for n in range(1, 2):
        try:
            print("正在爬第:"+str(n)+"页")
            main(n)
        except Exception as e:
            print("first blood!")
            # Show what actually failed instead of discarding the exception.
            print(e)
        # Pause for a minute after every 14th page.
        if n % 14 == 0:
            time.sleep(60)
    t2 = time.time()
    print("所耗时间:")
    print(t2-t1)

很尴尬

详情页总是触发验证码反爬,以后再想想办法吧

目前是这个鬼样子:

image-20220719230132308