o
     hz                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
mZ d dlZd dlmZ ejd d dlmZ G dd deZdS )	    N)	b64decode	b64encode)urlparsequote)PyQueryz..)Spiderc                   @   s   e Zd Zd7ddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zd8ddZd8ddZdd Zdd  Zd9d"d#Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd7d.d/Zd0d1 Zd2d3 Zd4d5 Zd6S ):r    c                 C   s~   zt || _W n   i | _Y dddddd| _|  | _| j| j| j dd | d	| j  td	| j  d S )
NzuMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36ztext/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7zzh-CN,zh;q=0.9z
keep-alivezno-cache)z
User-AgentAcceptzAccept-Language
ConnectionzCache-Control/)OriginRefereru   使用站点: )	jsonloadsproxiesheadersget_working_hosthostupdatelogprint)selfextend r   ,   /storage/emulated/0/lz/py/sy/今日看料.pyinit   s   
zSpider.initc                 C      dS )Nu   🌈 今日看料r   r   r   r   r   getName%      zSpider.getNamec                    s   t  fdddD S )Nc                 3   s    | ]	}| pd v V  qdS )r   Nr   .0exturlr   r   	<genexpr>*   s    z'Spider.isVideoFormat.<locals>.<genexpr>).m3u8.mp4z.ts)any)r   r$   r   r#   r   isVideoFormat(   s   zSpider.isVideoFormatc                 C   r   )NFr   r   r   r   r   manualVideoCheck,   r   zSpider.manualVideoCheckc                 C   s   d S Nr   r   r   r   r   destroy/   r   zSpider.destroyc                    s  z7t j| j| j| jdd}|jdkrg g dW S | |j}i }g }g d}d}|D ]p}|| D ]g}	|		dp;d |	 
 }
 rs|
rs d	kss d
ssd  v ssd  v ssd  v ssd  v ssd  v ssd  v rtq3d v st fdddD r dr }nd  }||
|d d}q3q+|sg d}|D ]7}|| D ].}	|		dpd |	 
 }
 r|
rڈ d	krڈ drˈ }nd  }||
|d d}qqg }t }|D ]}|d |vr|| ||d  q|s)dddddddddd d!dd"d#dd$d%dd&d'dd(d)dd*d+dd,d-dd.d/dg}||d0< | |d1|d2< |W S  tyW } ztd3|  g g dW  Y d }~S d }~ww )4N   r   r   timeout   )classlist)z/#navbarCollapse .navbar-nav .nav-item .nav-linkz.navbar-nav .nav-item .nav-linkz#nav .menu-item az.menu .menu-item aFhrefr   #httpZaboutZcontacttagstopstarttimez
/category/c                 3       | ]}| v V  qd S r+   r   )r!   catr3   r   r   r%   W       z%Spider.homeContent.<locals>.<genexpr>)z/dy/z/ks/z/douyu/z/hy/z/hj/z/tt/z/wh/z/asmr/z/xb/z/xsp/z/rdgz/r   )	type_nametype_idT)z.category-list az .slide-toggle + .category-list az.menu .category-list ar?   u   热点关注z/category/rdgz/u   抖音z/category/dy/u   快手z/category/ks/u   斗鱼z/category/douyu/u   虎牙z/category/hy/u   花椒z/category/hj/u   推特z/category/tt/u   网红z/category/wh/ZASMRz/category/asmr/u   X播z/category/xb/u	   小视频z/category/xsp/r1   $#index article a, #archive article ar2   zhomeContent error: )requestsgetr   r   r   status_codegetpqtextitemsattrstrip
startswithlowerr(   appendsetaddgetlist	Exceptionr   )r   filterresponsedataresultclassesZnav_selectorsZfound_categoriesselectoritemnamer?   Zcategory_selectorsZunique_classesZseen_idsclser   r<   r   homeContent2   s   





zSpider.homeContentc              
   C   s   z&t j| j| j| jdd}|jdkrdg iW S | |j}d| |diW S  t	yC } zt
d|  dg iW  Y d }~S d }~ww )Nr-   r.   r0   r2   r@   zhomeVideoContent error: )rA   rB   r   r   r   rC   rD   rE   rN   rO   r   )r   rQ   rR   rY   r   r   r   homeVideoContent   s   

zSpider.homeVideoContentc              
   C   sZ  z| dd}|r|dkr| j | d| d}n| j | d}td|  tj|| j| jdd}|jdkrKtd|j  g |dd	d
dW S | 	|j
}| |d|}	|	sc| |d|}	tdt|	 d | ||}
i }|	|d< ||d< |
|d< d	|d< d|d< |W S  ty } ztd|  g |dd	d
dW  Y d }~S d }~ww )Nr   1u   分类页面URL: r-   r.   r0   u   分类页面请求失败:    Z   r   r2   page	pagecountlimittotal0#archive article a, #index article a, .post-card"article a, .post a, .entry-title a   找到 u
    个视频r2   r`   ra   rb   ?B rc   zcategoryContent error: lstriprstripr   r   rA   rB   r   r   rC   rD   rE   rN   lendetect_page_countrO   r   tidpgrP   r   base_urlr$   rQ   rR   videosra   rS   rY   r   r   r   categoryContent   s8   
zSpider.categoryContentc              
   C   sZ  z| dd}|r|dkr| j | d| d}n| j | d}td|  tj|| j| jdd}|jdkrKtd|j  g |dd	d
dW S | 	|j
}| |d|}	|	sc| |d|}	tdt|	 d | ||}
i }|	|d< ||d< |
|d< d	|d< d|d< |W S  ty } ztd|  g |dd	d
dW  Y d}~S d}~ww )u   标签页面内容r   r\   u   标签页面URL: r-   r.   r0   u   标签页面请求失败: r]   r^   r   r_   rd   re   rf   u    个标签相关视频r2   r`   ra   rb   rg   rc   ztagContent error: Nrh   rm   r   r   r   
tagContent   s8   
zSpider.tagContentc                 C   s  d}g }g d}|D ]H}||  D ]?}|dpd}|  }	|r>td|d}
|
r>t|
d}||vr>|	| |	rQ|	
 rQt|	}||vrQ|	| qq
|rbt|}td|  |S g d	}|D ]}||rutd
  dS qht|ddk rtd t|S td dS )u   改进的页数检测方法i )z.page-navigator az.pagination az.pages az.page-numbers ar3   r   z	/(\d+)/?$r   r]   u#   从分页器检测到最大页码: )z.page-navigator .nextz.pagination .nextz
.next-pageu   a:contains("下一页")u-   检测到下一页按钮，允许继续翻页z,#archive article, #index article, .post-card   u-   当前页内容较少，可能没有下一页u   使用默认页数: 99999)rF   rG   rE   rH   researchrj   intgrouprK   isdigitmaxr   rk   )r   rR   Zcurrent_pagera   Zpage_numbersZpage_selectorsrU   Z	page_linkr3   rE   matchpage_numZmax_pageZnext_selectorsr   r   r   rl      sD   

zSpider.detect_page_countc              
      s0  z|d  ds| j |d  n|d }tj|| j| jdd}|jdkr2ddd| d	giW S | |j}d
di}g d}|D ]}||}|rT| 	 |d<  nqBd|vr]d|d< zQg }	|dr|d
 D ]/}
|
 }|
d}|r|r| | jr|| jd}|	dt||d d | d  qj|	rd|	n	|d p|d |d< W n
   |d |d< Y z
g }t }|dr6t|d
 ddD ]f\}}
|
d}|r5zTt|}|di dd}|r.d| }d}||v r
d| d | }|d7 }||v s|| | d!| d"|  td!| d"|  || d#|  W q   Y qq|sg d$}|D ]\}t||
 ddD ]N\}}|d%p\|dp\d  rt fd&d'd(D rd| }d}||v rd| d | }|d7 }||v sx|| || d#   qKq?|r| d)t| d* td)t| d* d+||d,< nd-| |d,< W n! ty } ztd.|  d-| |d,< W Y d }~nd }~ww d|giW S  ty } ztd/|  ddd0|r|d nd d	giW  Y d }~S d }~ww )1Nr   r5   r-   r.   r0   r2   u   今日看料u   页面加载失败$)vod_play_fromvod_play_urlr}   )z.post-titlezh1.entry-titleh1.post-card-titlevod_nameu   今日看料视频z.tags .keywords ar3   r   z[a=cr:)idrW   z/]z[/a] z.post-contentvod_contentz.dplayerr]   )r8   zdata-configvideor$   u   视频   _u   解析到视频: z -> $)zvideo sourcer   ziframe[src*="video"]za[href*=".m3u8"]za[href*=".mp4"]srcc                 3   r:   r+   r   r    r   r   r   r%   |  r=   z'Spider.detailContent.<locals>.<genexpr>)r&   r'   r   u   拼装播放列表，共u   个r4   r~   u   正片$u   视频解析错误: zdetailContent error: u   详情页加载失败$)rI   r   rA   rB   r   r   rC   rD   rE   rH   rF   rG   replacerK   r   dumpsjoinrL   	enumerater   rM   r   r   r(   rk   rO   )r   idsr$   rQ   rR   vodZtitle_selectorsrU   Z
title_elemZclistktitler3   plistZ
used_namescZconfig_attrconfig	video_urlrW   countZvideo_selectorselemrY   r   r   r   detailContent6  s   (

&*









0zSpider.detailContentr\   c              
   C   s&  ztt |}|dkr| j d| d| n| j d| d}tj|| j| jdd}|jdkrP|dkr<| j d| d| n| j d| d}tj|| j| jdd}|jdkr[g |dW S | |j}| 	|d	}| 
||}	|||	d
W S  ty }
 ztd|
  g |dW  Y d }
~
S d }
~
ww )Nr\   /tag/r   r-   r.   r0   z/search/r2   r`   rd   )r2   r`   ra   zsearchContent error: )r   r   rA   rB   r   r   rC   rD   rE   rN   rl   rO   r   )r   keyquickro   encoded_keyr$   rQ   rR   rq   ra   rY   r   r   r   searchContent  s$   0
0
zSpider.searchContentc                 C   sX  z| j  d}tj|| j| jdd}|jdkrg |dW S | |j}g }|d D ]=}| 	 }|
dp9d}|rh|rhd	|v rh|d
krh|| j d}	|	dsVd|	 }	||	d| ddddddd q+tdt| d i }
||
d< ||
d< d|
d< d|
d< t||
d< |
W S  ty } ztd|  g |dW  Y d}~S d}~ww )u   获取标签页面内容z
/tags.htmlr-   r.   r0   r   za[href*="/tag/"]r3   r   r   u   全部标签r   u   🏷️ u   标签tagrectHzG?typeratiovod_idr   vod_picvod_remarksZvod_tagstylerf   u
    个标签r2   r`   r]   ra   i  rb   rc   zgetTagsContent error: N)r   rA   rB   r   r   rC   rD   rE   rF   rH   rG   r   rI   rK   r   rk   rO   )r   ro   r$   rQ   rR   r6   Ztag_elemtag_nameZtag_hrefZtag_idrS   rY   r   r   r   getTagsContent  sH   

	zSpider.getTagsContentc                 C   s`   |}d}|  |rd|v r| |}d}| d| d|  td| d|  ||| jdS )Nr]   r&   r   u   播放请求: parse=z, url=)parser$   header)r)   proxyr   r   r   )r   flagr   vipFlagsr$   pr   r   r   playerContent  s   

zSpider.playerContentc              
   C   s   zV| ddkr@| |d }|ds)|dr!| j | }n| j d| }tj || j| jdd}d|j d	d
|jgW S | ddkrO| |d W S | 	|d W S  t
y{ } ztd|  dddt|  gW  Y d }~S d }~ww )Nr   imgr$   zhttp://zhttps://r   
   r.   r0   Content-Typez
image/jpegm3u8zlocalProxy error:   
text/plainzProxy error: )rB   d64rI   r   rA   r   r   contentm3ProxytsProxyrO   r   strencode)r   paramimg_urlresrY   r   r   r   
localProxy  s"   

$zSpider.localProxyr   c                 C   s0   |rt | jr|   d| | d| S |S )N&url=z&type=)rk   r   getProxyUrle64)r   rR   r   r   r   r   r     s   zSpider.proxyc              
   C   s  z|  |}tj|| j| jdd}|jd}|jdr0|jd }tj|| j| jdjd}| d}|d |	d }t
|}|jd |j }d	}t|D ]]\}	}
|r}d
|
v r}d}t||
}|r}t|d| |dd d|
||	< d}qRd|
vrd|
vr|
ddk r|n|}||
drdnd |
 }
| |
|
dd dd ||	< qRd|}dd|gW S  ty } ztd|  dddt|  gW  Y d }~S d }~ww )NF)r   r   allow_redirectsutf-8Location)r   r   
r   z://TURIzURI="([^"]*)"zURI="r]   Zmkey"z#EXTr5   r   r   .?r   r0   zapplication/vnd.apple.mpegurzm3Proxy error: r   r   zm3u8 proxy error: )r   rA   rB   r   r   r   decoderH   splitrfindr   schemenetlocr   ru   rv   subr   rx   r   rI   r   rO   r   r   r   )r   r$   ZydatarR   linesZlast_r
parsed_urlZdurlZiskeyindexstringpatternr{   domainrY   r   r   r   r     sB   

($
$zSpider.m3Proxyc              
   C   s   z|  |}tj|| j| jdd}d|jdd|jgW S  tyA } ztd|  ddd	t| 	 gW  Y d }~S d }~ww )
NT)r   r   streamr0   r   z
video/mp2tztsProxy error: r   r   zts proxy error: )
r   rA   rB   r   r   r   rO   r   r   r   )r   r$   rR   rY   r   r   r   r   %  s   
$zSpider.tsProxyc              
   C   X   z| d}t|}|dW S  ty+ } ztdt|  W Y d }~dS d }~ww )Nr   u   Base64编码错误: r   )r   r   r   rO   r   r   )r   rE   
text_bytesencoded_bytesrY   r   r   r   r   .     
z
Spider.e64c              
   C   r   )Nr   u   Base64解码错误: r   )r   r   r   rO   r   r   )r   encoded_textr   decoded_bytesrY   r   r   r   r   7  r   z
Spider.d64c                 C   s   g d}|D ]J}z6t j|| j| jdd}|jdkr=| |j}|d}t|dkr=| d|  t	d|  |W   S W q t
yP } zW Y d}~qd}~ww | d	|d   t	d	|d   |d S )
z(Get working host from known dynamic URLs)zhttps://kanliao25.com//zhttps://kanliao7.org/zhttps://kanliao7.net/zhttps://kanliao14.com/r   r.   r0   r@   r   u   选用可用站点: Nu#   未检测到可用站点，回退: )rA   rB   r   r   rC   rD   rE   rk   r   r   rO   )r   Zdynamic_urlsr$   rQ   rR   ZarticlesrY   r   r   r   r   @  s&   

zSpider.get_working_hostc           	      C   s   g }|  D ]q}|d}|d p"|d p"|d p"| }|d p.|d }| |r<td|  q|rw|rw| rw|dsW|d	rQ|}nd	| }n|}|||d
d | 	||rm| ndddddd q|S )Nr3   h2r   z.entry-titlezspan[itemprop="datePublished"]z..post-meta, .entry-meta, time, .post-card-infou   过滤广告: r5   r   r   r   r   r   r   r   r   )
rF   rG   rE   is_advertisementr   rH   rI   rK   r   get_article_img)	r   rR   rn   rq   r   abr   r   r   r   r   rN   Z  s0   
,


zSpider.getlistc                    s   | d}| D ]}d| v r dS q	|d p"|d p"dg d}tfdd	|D r4dS |d
p:d d v r[t fdd	dD r[ddg}t fdd	|D r[dS dS )u0   判断是否为广告（包含热搜HOT标志）z.wraps	   热搜HOTTr   r   r   )r   u   手机链接u	   DNS设置u	   修改DNSu
   WIFI设置c                 3   r:   r+   r   )r!   keyword)r   r   r   r%     r=   z*Spider.is_advertisement.<locals>.<genexpr>r   zbackground:c                 3   r:   r+   r   r!   Zgradientr   r   r   r%     r=   )z-webkit-linear-gradientzlinear-gradientz#ec008c,#fc6767z#ffe259,#ffa751c                 3   r:   r+   r   r   r   r   r   r%     r=   F)findrF   rE   r(   rG   )r   article_elemZhot_elementsr   Zad_keywordsZad_gradientsr   )r   r   r   r   z  s   
zSpider.is_advertisementc                 C   s  |d  }|r<td|}|r<|d}|ds/|dr'| j | }n| j d| }|   d| | dS |d}|r|	d	pId
}td|}|r|d}|r|ds|dsx|drp| j | }n| j d| }|   d| | dS |d}	|	r|		d}
|
r|
ds|
dr| j |
 }
n| j d|
 }
|   d| |
 dS |		d}|r|ds|dr| j | }n| j d| }|   d| | dS d
S )u3   从文章元素中提取图片，多种方式尝试scriptzloadBannerDirect\('([^']+)'r]   r   r   r   z	&type=imgz.blog-backgroundr   r   z2background-image:\s*url\(["\']?([^"\'\)]+)["\']?\)zdata:r   zdata-srcr   )
rE   ru   rv   rx   rI   r   r   r   r   rG   )r   r   script_textr{   r$   Zbg_elemr   Zbg_matchr   Zimg_elemZdata_srcr   r   r   r   r     sN   













zSpider.get_article_imgc              
   C   sL   zt |W S  ty% } ztt|  t |dW  Y d }~S d }~ww )Nr   )pqrO   r   r   r   )r   rR   rY   r   r   r   rD     s   
zSpider.getpqN)r   )r\   )r   )__name__
__module____qualname__r   r   r)   r*   r,   rZ   r[   rr   rs   rl   r   r   r   r   r   r   r   r   r   r   r   rN   r   r   rD   r   r   r   r   r      s4    
j()>
`
/
 			
 5r   )r   randomru   sys	threadingr9   base64r   r   urllib.parser   r   rA   pyqueryr   r   pathrK   base.spiderr   r   r   r   r   <module>   s   