U
     hz                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
mZ d dlZd dlmZ ejd d dlmZ G dd deZdS )	    N)	b64decode	b64encode)urlparsequote)PyQueryz..)Spiderc                   @   s   e Zd Zd7ddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zd8ddZd9ddZdd Zdd  Zd:d"d#Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd;d.d/Zd0d1 Zd2d3 Zd4d5 Zd6S )<r    c                 C   s   zt || _W n   i | _Y nX dddddd| _|  | _| j| j| j dd | d	| j  td	| j  d S )
NzuMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36ztext/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7zzh-CN,zh;q=0.9z
keep-alivezno-cache)z
User-AgentAcceptzAccept-Language
ConnectionzCache-Control/)OriginRefereru   使用站点: )	jsonloadsproxiesheadersget_working_hosthostupdatelogprint)selfextend r   ,   /storage/emulated/0/lz/py/sy/今日看料.pyinit   s      
zSpider.initc                 C   s   dS )Nu   🌈 今日看料r   r   r   r   r   getName%   s    zSpider.getNamec                    s   t  fdddD S )Nc                 3   s   | ]}| pd kV  qdS )r   Nr   .0exturlr   r   	<genexpr>*   s     z'Spider.isVideoFormat.<locals>.<genexpr>).m3u8.mp4z.ts)any)r   r"   r   r!   r   isVideoFormat(   s    zSpider.isVideoFormatc                 C   s   dS )NFr   r   r   r   r   manualVideoCheck,   s    zSpider.manualVideoCheckc                 C   s   d S Nr   r   r   r   r   destroy/   s    zSpider.destroyc                    s  zt j| j| j| jdd}|jdkr2g g dW S | |j}i }g }ddddg}d	}|D ]}|| D ]}	|		d
pzd |	 
 }
 rj|
rj dksj dsjd  ksjd  ksjd  ksjd  ksjd  ksjd  krqjd kst fdddD rj dr }n
d  }||
|d d}qjqZ|sdddg}|D ]|}|| D ]h}	|		d
pvd |	 
 }
 rd|
rd dkrd dr }n
d  }||
|d d}qdqTg }t }|D ],}|d |kr|| ||d  q|spdd dd!d"dd#d$dd%d&dd'd(dd)d*dd+d,dd-d.dd/d0dd1d2dd3d4dg}||d5< | |d6|d7< |W S  tk
r } z td8|  g g d W Y S d }~X Y nX d S )9N   r   r   timeout   )classlistz/#navbarCollapse .navbar-nav .nav-item .nav-linkz.navbar-nav .nav-item .nav-linkz#nav .menu-item az.menu .menu-item aFhrefr   #httpZaboutZcontacttagstopstarttimez
/category/c                 3   s   | ]}| kV  qd S r)   r   )r   catr1   r   r   r#   W   s     z%Spider.homeContent.<locals>.<genexpr>)z/dy/z/ks/z/douyu/z/hy/z/hj/z/tt/z/wh/z/asmr/z/xb/z/xsp/z/rdgz/r   )	type_nametype_idTz.category-list az .slide-toggle + .category-list az.menu .category-list ar;   u   热点关注z/category/rdgz/u   抖音z/category/dy/u   快手z/category/ks/u   斗鱼z/category/douyu/u   虎牙z/category/hy/u   花椒z/category/hj/u   推特z/category/tt/u   网红z/category/wh/ZASMRz/category/asmr/u   X播z/category/xb/u	   小视频z/category/xsp/r/   $#index article a, #archive article ar0   zhomeContent error: )requestsgetr   r   r   status_codegetpqtextitemsattrstrip
startswithlowerr&   appendsetaddgetlist	Exceptionr   )r   filterresponsedataresultclassesZnav_selectorsZfound_categoriesselectoritemnamer;   Zcategory_selectorsZunique_classesZseen_idsclser   r9   r   homeContent2   s    






	 


zSpider.homeContentc              
   C   s   zLt j| j| j| jdd}|jdkr.dg iW S | |j}d| |diW S  t	k
r } zt
d|  dg i W Y S d }~X Y nX d S )Nr+   r,   r.   r0   r<   zhomeVideoContent error: )r=   r>   r   r   r   r?   r@   rA   rJ   rK   r   )r   rM   rN   rU   r   r   r   homeVideoContent   s    

zSpider.homeVideoContentc              
   C   sd  z| dd}|r8|dkr8| j | d| d}n| j | d}td|  tj|| j| jdd}|jdkrtd|j  g |dd	d
dW S | 	|j
}| |d|}	|	s| |d|}	tdt|	 d | ||}
i }|	|d< ||d< |
|d< d	|d< d|d< |W S  tk
r^ } z&td|  g |dd	d
d W Y S d }~X Y nX d S )Nr   1u   分类页面URL: r+   r,   r.   u   分类页面请求失败:    Z   r   r0   page	pagecountlimittotal0#archive article a, #index article a, .post-card"article a, .post a, .entry-title a   找到 u
    个视频r0   r\   r]   r^   ?B r_   zcategoryContent error: lstriprstripr   r   r=   r>   r   r   r?   r@   rA   rJ   lendetect_page_countrK   r   tidpgrL   r   base_urlr"   rM   rN   videosr]   rO   rU   r   r   r   categoryContent   s4    
zSpider.categoryContentc              
   C   sd  z| dd}|r8|dkr8| j | d| d}n| j | d}td|  tj|| j| jdd}|jdkrtd|j  g |dd	d
dW S | 	|j
}| |d|}	|	s| |d|}	tdt|	 d | ||}
i }|	|d< ||d< |
|d< d	|d< d|d< |W S  tk
r^ } z&td|  g |dd	d
d W Y S d}~X Y nX dS )u   标签页面内容r   rX   u   标签页面URL: r+   r,   r.   u   标签页面请求失败: rY   rZ   r   r[   r`   ra   rb   u    个标签相关视频r0   r\   r]   r^   rc   r_   ztagContent error: Nrd   ri   r   r   r   
tagContent   s4    
zSpider.tagContentc                 C   s"  d}g }ddddg}|D ]}||  D ]~}|dp8d}|  }	|rtd|d	}
|
rt|
d
}||kr|	| |	r(|	
 r(t|	}||kr(|	| q(q|rt|}td|  |S ddddg}|D ]}||rtd  dS qt|ddk rtd t|S td dS )u   改进的页数检测方法i z.page-navigator az.pagination az.pages az.page-numbers ar1   r   z	/(\d+)/?$r   rY   u#   从分页器检测到最大页码: z.page-navigator .nextz.pagination .nextz
.next-pageu   a:contains("下一页")u-   检测到下一页按钮，允许继续翻页z,#archive article, #index article, .post-card   u-   当前页内容较少，可能没有下一页u   使用默认页数: 99999)rB   rC   rA   rD   researchrf   intgrouprG   isdigitmaxr   rg   )r   rN   Zcurrent_pager]   Zpage_numbersZpage_selectorsrQ   Z	page_linkr1   rA   matchpage_nummax_pageZnext_selectorsr   r   r   rh      sN    
zSpider.detect_page_countc              
      sJ  z|d  ds$| j |d  n|d }tj|| j| jdd}|jdkrdddd| d	giW S | |j}d
di}ddddg}|D ]$}||}|r| 	 |d<  qqd|krd|d< zg }	|dr<|d
 D ]`}
|
 }|
d}|r|r| | jr|| jd}|	dt||d d | d  q|	rLd|	n|d p`|d |d< W n   |d |d< Y nX zg }t }|drt|d
 ddD ]\}}
|
d}|rzt|}|di d d}|rhd!| }d"}||kr d!| d#| }|d7 }q|| | d$| d%|  td$| d%|  || d&|  W n   Y qY nX q|sNd'dd(d)d*g}|D ]}t||
 ddD ]\}}|d+p|dpd  rt fd,d-d.D rd!| }d"}||kr(d!| d#| }|d7 }q|| || d&   qq|r| d/t| d0 td/t| d0 d1||d2< nd3| |d2< W n@ tk
r } z td4|  d3| |d2< W 5 d }~X Y nX d|giW S  tk
rD } z:td5|  ddd6|r |d nd d	gi W Y S d }~X Y nX d S )7Nr   r3   r+   r,   r.   r0   u   今日看料u   页面加载失败$)vod_play_fromvod_play_urlrz   z.post-titlezh1.entry-titleh1.post-card-titlevod_nameu   今日看料视频z.tags .keywords ar1   r   z[a=cr:)idrS   z/]z[/a] z.post-contentvod_contentz.dplayerrY   )r6   zdata-configvideor"   u   视频   _u   解析到视频: z -> $zvideo sourceziframe[src*="video"]za[href*=".m3u8"]za[href*=".mp4"]srcc                 3   s   | ]}| kV  qd S r)   r   r   r   r   r   r#   |  s     z'Spider.detailContent.<locals>.<genexpr>)r$   r%   r   u   拼装播放列表，共u   个r2   r{   u   正片$u   视频解析错误: zdetailContent error: u   详情页加载失败$)rE   r   r=   r>   r   r   r?   r@   rA   rD   rB   rC   replacerG   r   dumpsjoinrH   	enumerater   rI   r   r   r&   rg   rK   )r   idsr"   rM   rN   vodZtitle_selectorsrQ   
title_elemclistktitler1   plistZ
used_namescZconfig_attrconfig	video_urlrS   countZvideo_selectorselemrU   r   r   r   detailContent6  s    (


(.








 zSpider.detailContentrX   c              
   C   s.  zt |}|dkr(| j d| d| n| j d| d}tj|| j| jdd}|jdkr|dkrx| j d| d| n| j d| d}tj|| j| jdd}|jdkrg |dW S | |j}| 	|d	}| 
||}	|||	d
W S  tk
r( }
 z td|
  g |d W Y S d }
~
X Y nX d S )NrX   /tag/r   r+   r,   r.   z/search/r0   r\   r`   )r0   r\   r]   zsearchContent error: )r   r   r=   r>   r   r   r?   r@   rA   rJ   rh   rK   r   )r   keyquickrk   encoded_keyr"   rM   rN   rm   r]   rU   r   r   r   searchContent  s     0
0
zSpider.searchContentc                 C   sb  z| j  d}tj|| j| jdd}|jdkr<g |dW S | |j}g }|d D ]z}| 	 }|
dptd}|rX|rXd	|krX|d
krX|| j d}	|	dsd|	 }	||	d| ddddddd qXtdt| d i }
||
d< ||
d< d|
d< d|
d< t||
d< |
W S  tk
r\ } z td|  g |d W Y S d}~X Y nX dS )u   获取标签页面内容z
/tags.htmlr+   r,   r.   r   za[href*="/tag/"]r1   r   r   u   全部标签r   u   🏷️ u   标签tagrectHzG?typeratiovod_idr~   vod_picvod_remarksvod_tagstylerb   u
    个标签r0   r\   rY   r]   i  r^   r_   zgetTagsContent error: N)r   r=   r>   r   r   r?   r@   rA   rB   rD   rC   r   rE   rG   r   rg   rK   )r   rk   r"   rM   rN   r4   Ztag_elemtag_nameZtag_hrefZtag_idrO   rU   r   r   r   getTagsContent  sB    


	zSpider.getTagsContentc                 C   s`   |}d}|  |r(d|kr$| |}d}| d| d|  td| d|  ||| jdS )NrY   r$   r   u   播放请求: parse=z, url=)parser"   header)r'   proxyr   r   r   )r   flagr   vipFlagsr"   pr   r   r   playerContent  s    

zSpider.playerContentc              
   C   s  z| ddkr| |d }|dsR|drB| j | }n| j d| }tj || j| jdd}d|j d	d
|jgW S | ddkr| |d W S | 	|d W S W nN t
k
 r } z.td|  dddt|  g W Y S d }~X Y nX d S )Nr   imgr"   zhttp://zhttps://r   
   r,   r.   Content-Typez
image/jpegm3u8zlocalProxy error:   
text/plainzProxy error: )r>   d64rE   r   r=   r   r   contentm3ProxytsProxyrK   r   strencode)r   paramimg_urlresrU   r   r   r   
localProxy  s    

zSpider.localProxyr   c                 C   s4   |r,t | jr,|   d| | d| S |S d S )N&url=z&type=)rg   r   getProxyUrle64)r   rN   r   r   r   r   r     s    zSpider.proxyc              
   C   s  zz|  |}tj|| j| jdd}|jd}|jdrb|jd }tj|| j| jdjd}| d}|d |	d }t
|}|jd |j }d	}t|D ]\}	}
|rd
|
krd}t||
}|rt|d| |dd d|
||	< d}qd|
krd|
krB|
ddk r"|n|}||
dr8dnd |
 }
| |
|
dd dd ||	< qd|}dd|gW S  tk
r } z.td|  dddt|  g W Y S d }~X Y nX d S )NF)r   r   allow_redirectsutf-8Location)r   r   
r   z://TURIzURI="([^"]*)"zURI="rY   Zmkey"z#EXTr3   r   r   .?r   r.   zapplication/vnd.apple.mpegurzm3Proxy error: r   r   zm3u8 proxy error: )r   r=   r>   r   r   r   decoderD   splitrfindr   schemenetlocr   rq   rr   subr   rt   r   rE   r   rK   r   r   r   )r   r"   ZydatarN   linesZlast_r
parsed_urldurlZiskeyindexstringpatternrw   domainrU   r   r   r   r     s<    

(
&
zSpider.m3Proxyc              
   C   s   z8|  |}tj|| j| jdd}d|jdd|jgW S  tk
r } z.td|  ddd	t| 	 g W Y S d }~X Y nX d S )
NT)r   r   streamr.   r   z
video/mp2tztsProxy error: r   r   zts proxy error: )
r   r=   r>   r   r   r   rK   r   r   r   )r   r"   rN   rU   r   r   r   r   %  s    
zSpider.tsProxyc              
   C   s^   z| d}t|}|dW S  tk
rX } ztdt|  W Y dS d }~X Y nX d S )Nr   u   Base64编码错误: r   )r   r   r   rK   r   r   )r   rA   
text_bytesencoded_bytesrU   r   r   r   r   .  s    
z
Spider.e64c              
   C   s^   z| d}t|}|dW S  tk
rX } ztdt|  W Y dS d }~X Y nX d S )Nr   u   Base64解码错误: r   )r   r   r   rK   r   r   )r   encoded_textr   decoded_bytesrU   r   r   r   r   7  s    
z
Spider.d64c                 C   s   ddddg}|D ]}zlt j|| j| jdd}|jdkr~| |j}|d}t|d	kr~| d
|  t	d
|  |W   S W q t
k
r } zW Y qW 5 d}~X Y qX q| d|d	   t	d|d	   |d	 S )z(Get working host from known dynamic URLszhttps://kanliao25.com//zhttps://kanliao7.org/zhttps://kanliao7.net/zhttps://kanliao14.com/r   r,   r.   r<   r   u   选用可用站点: Nu#   未检测到可用站点，回退: )r=   r>   r   r   r?   r@   rA   rg   r   r   rK   )r   Zdynamic_urlsr"   rM   rN   ZarticlesrU   r   r   r   r   @  s(    
zSpider.get_working_hostc           	      C   s   g }|  D ]}|d}|d pD|d pD|d pD| }|d p\|d }| |rxtd|  q|r|r| r|ds|d	r|}qd	| }n|}|||d
d | 	||r| ndddddd q|S )Nr1   h2r}   z.entry-titlezspan[itemprop="datePublished"]z..post-meta, .entry-meta, time, .post-card-infou   过滤广告: r3   r   r   r   r   r   r   r   r   )
rB   rC   rA   is_advertisementr   rD   rE   rG   r   get_article_img)	r   rN   rj   rm   r   abr   r   r   r   r   rJ   Z  s.    
,



zSpider.getlistc                    s   | d}| D ]}d| kr dS q|d pD|d pDddddd	d
g}tfdd|D rndS |dpzd d krt fdddD rddg}t fdd|D rdS dS )u0   判断是否为广告（包含热搜HOT标志）z.wrapsu	   热搜HOTTr   r}   r   u   手机链接u	   DNS设置u	   修改DNSu
   WIFI设置c                 3   s   | ]}| kV  qd S r)   r   )r   keyword)r   r   r   r#     s     z*Spider.is_advertisement.<locals>.<genexpr>r   zbackground:c                 3   s   | ]}| kV  qd S r)   r   r   Zgradientr   r   r   r#     s     )z-webkit-linear-gradientzlinear-gradientz#ec008c,#fc6767z#ffe259,#ffa751c                 3   s   | ]}| kV  qd S r)   r   r   r   r   r   r#     s     F)findrB   rA   r&   rC   )r   article_elemZhot_elementsr   Zad_keywordsZad_gradientsr   )r   r   r   r   z  s    
zSpider.is_advertisementc                 C   s  |d  }|rxtd|}|rx|d}|ds^|drN| j | }n| j d| }|   d| | dS |d}|r|	d	pd
}td|}|r|d}|r|ds|ds|dr| j | }n| j d| }|   d| | dS |d}	|	r|		d}
|
r|
dsj|
drZ| j |
 }
n| j d|
 }
|   d| |
 dS |		d}|r|ds|dr| j | }n| j d| }|   d| | dS d
S )u3   从文章元素中提取图片，多种方式尝试scriptzloadBannerDirect\('([^']+)'rY   r   r   r   z	&type=imgz.blog-backgroundr   r   z2background-image:\s*url\(["\']?([^"\'\)]+)["\']?\)zdata:r   zdata-srcr   )
rA   rq   rr   rt   rE   r   r   r   r   rC   )r   r   script_textrw   r"   Zbg_elemr   Zbg_matchr   img_elemZdata_srcr   r   r   r   r     sN    









zSpider.get_article_imgc              
   C   sR   z
t |W S  tk
rL } z$tt|  t |d W Y S d }~X Y nX d S )Nr   )pqrK   r   r   r   )r   rN   rU   r   r   r   r@     s
    
zSpider.getpqN)r   )rX   )rX   )r   )r   )__name__
__module____qualname__r   r   r'   r(   r*   rV   rW   rn   ro   rh   r   r   r   r   r   r   r   r   r   r   r   rJ   r   r   r@   r   r   r   r   r      s2   
j()>`

/
 			
 5r   )r   randomrq   sys	threadingr7   base64r   r   urllib.parser   r   r=   pyqueryr   r   pathrG   base.spiderr   r   r   r   r   <module>   s   