Note

video url을 통한 비디오 정보 수집 본문

etc/Crawling

video url을 통한 비디오 정보 수집

알 수 없는 사용자 2022. 4. 8. 23:53
# Scrape metadata for each video by fetching its YouTube watch page and
# parsing the <meta> tags. Results are keyed by the video id/path.
video_results = {}  # video_id -> dict of scraped fields
for cnt, video_id in enumerate(tqdm(array_video_id)):  # list of "/watch?v=..." paths
    # Crude throttle: sleep on every 9th request (offset 2) to reduce the
    # chance of being rate-limited by YouTube.
    if cnt % 9 == 2:
        time.sleep(3)
    video_url = "https://www.youtube.com" + video_id
    response = session.get(video_url, headers=headers)  # fetch the watch page
    if response.status_code == 429:  # rate-limited; surfaced for debugging only
        print(response)
    soup = bs(response.text, "html.parser")

    result = {}
    try:
        # Each required field lives in a <meta> tag. If a tag is missing,
        # soup.find() returns None and the ['content'] subscript raises
        # TypeError, which we catch below to skip the whole video.
        result['video_id'] = soup.find("meta", itemprop="videoId")['content']
        result['channel_id'] = soup.find("meta", itemprop="channelId")['content']
        result['title'] = soup.find("meta", property="og:title")['content']
        result['image'] = soup.find("meta", property="og:image")['content']
        result['genre'] = soup.find("meta", itemprop="genre")['content']
        result['published_date'] = soup.find("meta", itemprop="datePublished")['content']
        # interactionCount (view count) can legitimately be absent; default to 0
        # instead of failing the video.
        view_count = soup.find("meta", itemprop="interactionCount")
        result['views'] = view_count['content'] if view_count is not None else 0
        result['duration'] = soup.find("meta", itemprop="duration")['content']
        # Also collect the free-form description/keywords meta tags, keyed by
        # the tag's own (original) name attribute.
        for tag in soup.find_all("meta"):
            tag_name = tag.attrs.get('name', '').strip().lower()
            if tag_name in ('description', 'keywords'):
                result[tag.attrs['name']] = tag.attrs['content']
        video_results[video_id] = result
    except (TypeError, KeyError):
        # A required meta tag was missing or malformed: skip this video rather
        # than recording partial data. (Was a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit.)
        continue
# Assemble the scraped per-video dicts into one DataFrame row per video
# (transpose so video ids become the index and fields become columns).
final_results = pd.DataFrame(video_results).T

# Column order expected downstream. 'likes' and 'ads_yn' are never produced
# by the scraping loop, so a plain final_results[[...]] selection would raise
# KeyError; reindex() keeps the same order but fills missing columns with NaN.
_select_columns = ['video_id', 'channel_id', 'title', 'image', 'published_date',
                   'views', 'likes', 'duration', 'genre', 'ads_yn']
insert_df = final_results.reindex(columns=_select_columns)
# Rename to the target table's schema (image -> video_thumbnails_url, etc.).
insert_df.columns = ['video_id', 'channel_id', 'title', 'video_thumbnails_url',
                     'publishDate', 'views', 'likes', 'duration', 'genre', 'ads_yn']
Comments