Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/update-ky-youtube.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ jobs:
working-directory: packages/crawling
run: pnpm install

- name: Install puppeteer dependencies
working-directory: packages/crawling
run: |
sudo apt-get update
sudo apt-get install -y chromium

- name: Create .env file
working-directory: packages/crawling
run: |
Expand Down
2 changes: 1 addition & 1 deletion packages/crawling/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"scripts": {
"ky-open": "tsx src/findKYByOpen.ts",
"ky-youtube": "tsx src/crawling/crawlYoutube.ts",
"ky-valid": "tsx src/crawling/crawlYoutubeValid.ts",
"ky-youtube-ubuntu": "tsx src/crawling/crawlYoutubeUbuntu.ts",
"ky-update": "pnpm run ky-youtube & pnpm run ky-valid",
"trans": "tsx src/postTransDictionary.ts",
"recent-tj": "tsx src/crawling/crawlRecentTJ.ts",
Expand Down
14 changes: 3 additions & 11 deletions packages/crawling/src/crawling/crawlYoutube.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,6 @@ import { isValidKYExistNumber } from './isValidKYExistNumber';
// action 우분투 환경에서의 호환을 위해 추가
const browser = await puppeteer.launch({
headless: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage', // 리눅스 메모리 제한 대응
'--disable-gpu',
'--disable-infobars',
'--single-process',
'--window-size=1920,1080',
],
});

const page = await browser.newPage();
Expand Down Expand Up @@ -75,6 +66,7 @@ const data = await getSongsKyNullDB();
const failedSongs = loadCrawlYoutubeFailedKYSongs();

console.log('getSongsKyNullDB : ', data.length);
console.log(failedSongs.size);
let index = 0;

for (const song of data) {
Expand All @@ -86,11 +78,10 @@ for (const song of data) {
const query = song.title + '-' + song.artist;

if (failedSongs.has(query)) {
console.log('failedSongs has : ', query);
continue;
}

console.log(song.title, ' - ', song.artist);

let resultKyNum = null;
try {
resultKyNum = await scrapeSongNumber(query);
Expand All @@ -115,6 +106,7 @@ for (const song of data) {
} else saveCrawlYoutubeFailedKYSongs(song.title, song.artist);

index++;
console.log(query);
console.log('scrapeSongNumber : ', index);
}

Expand Down
122 changes: 122 additions & 0 deletions packages/crawling/src/crawling/crawlYoutubeUbuntu.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import * as cheerio from 'cheerio';
import puppeteer from 'puppeteer';

import { getSongsKyNullDB } from '@/supabase/getDB';
import { updateSongsKyDB } from '@/supabase/updateDB';
import { Song } from '@/types';
import {
loadCrawlYoutubeFailedKYSongs,
saveCrawlYoutubeFailedKYSongs,
updateDataLog,
} from '@/utils/logData';

import { isValidKYExistNumber } from './isValidKYExistNumber';

// Crawl KY karaoke song numbers from YouTube.
// Also includes the real-site verification that crawlYoutubeValid performs.

// Chromium flags added for compatibility with the Ubuntu environment used by the GitHub Action.
const chromiumLaunchArgs = [
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage', // works around Linux memory limits
  '--disable-gpu',
  '--disable-infobars',
  '--single-process',
  '--window-size=1920,1080',
];

// Single headless browser shared by every scrape in this script.
const browser = await puppeteer.launch({
  executablePath: '/usr/bin/chromium-browser', // or "/usr/bin/chromium"
  headless: true,
  args: chromiumLaunchArgs,
});

// Single tab reused for both the YouTube search and the number validation.
const page = await browser.newPage();

// Search endpoint of the KY karaoke YouTube channel; the query string is appended per song.
const baseUrl = 'https://www.youtube.com/@KARAOKEKY/search';

/**
 * Searches the KY karaoke YouTube channel for `query` and extracts the karaoke
 * number from the title of the first video result.
 *
 * @param query Search text (the caller passes "title-artist").
 * @returns The captured number as a string, or null when the first result's
 *          title does not contain a "KY. <digits>)" marker.
 */
const scrapeSongNumber = async (query: string) => {
  // The `waitUntil` setting of page.goto was the cause of an earlier problem,
  // so navigation explicitly waits for 'networkidle2'. `timeout: 0` is passed
  // as-is from the original (NOTE(review): in Puppeteer this disables the
  // navigation timeout — confirm that is intended).
  await page.goto(`${baseUrl}?query=${encodeURIComponent(query)}`, {
    waitUntil: 'networkidle2',
    timeout: 0,
  });

  // Parse a snapshot of the rendered page with cheerio.
  const $ = cheerio.load(await page.content());

  // Previous approach: take the first ytd-item-section-renderer under #contents.
  // const firstItem = $("#contents ytd-item-section-renderer").first();

  // Text of the first yt-formatted-string inside the first video renderer.
  const firstVideoTitle = $('ytd-video-renderer')
    .first()
    .find('yt-formatted-string')
    .first()
    .text()
    .trim();

  return extractKaraokeNumber(firstVideoTitle);
};

/**
 * Pulls the karaoke number out of a video title.
 *
 * Looks for "KY." followed by optional whitespace, 2 to 5 digits, and a
 * closing parenthesis, and returns just the digits.
 *
 * @param title Video title text to inspect.
 * @returns The digit string, or null when the pattern is not present.
 */
const extractKaraokeNumber = (title: string) => {
  // Find "KY." and capture the digits up to the closing ")".
  const kyNumberPattern = /KY\.\s*(\d{2,5})\)/;
  return kyNumberPattern.exec(title)?.[1] ?? null;
};

/**
 * Writes one song to the DB through updateSongsKyDB and records the outcome:
 * the `success` part of the result goes to crawlYoutubeSuccess.txt and the
 * `failed` part goes to crawlYoutubeFailed.txt.
 *
 * @param data Song to persist (the caller passes it with `num_ky` filled in).
 */
const updateData = async (data: Song) => {
  const { success, failed } = await updateSongsKyDB(data);
  updateDataLog(success, 'crawlYoutubeSuccess.txt');
  updateDataLog(failed, 'crawlYoutubeFailed.txt');
};

// Stop after this many fully processed songs per run (originally added as a
// test limit). Songs skipped via `continue` do not count toward it.
const MAX_PROCESSED_SONGS = 100;

// Main crawl: for every song returned by getSongsKyNullDB, look up its KY
// number on YouTube, validate it, and store it. The whole run is wrapped in
// try/finally so the browser is closed (and the close awaited) even when a
// step outside the inner try blocks — e.g. the DB update — throws.
try {
  const data = await getSongsKyNullDB();
  // Queries that already failed in an earlier run; they are skipped below.
  const failedSongs = loadCrawlYoutubeFailedKYSongs();

  console.log('getSongsKyNullDB : ', data.length);
  let index = 0;

  for (const song of data) {
    if (index >= MAX_PROCESSED_SONGS) {
      break;
    }

    const query = song.title + '-' + song.artist;

    if (failedSongs.has(query)) {
      continue;
    }

    console.log(song.title, ' - ', song.artist);

    let resultKyNum: string | null = null;
    try {
      resultKyNum = await scrapeSongNumber(query);
    } catch (error) {
      // Best-effort: a failed scrape skips this song, but leaves a trace in the log.
      console.error('scrapeSongNumber failed : ', query, error);
      continue;
    }

    if (resultKyNum) {
      let isValid = true;
      try {
        isValid = await isValidKYExistNumber(page, resultKyNum, song.title, song.artist);
      } catch (error) {
        // Best-effort: a failed validation skips this song without marking it as failed.
        console.error('isValidKYExistNumber failed : ', query, error);
        continue;
      }

      if (!isValid) {
        // The scraped number did not pass validation; remember the song as failed.
        saveCrawlYoutubeFailedKYSongs(song.title, song.artist);
        continue;
      }

      await updateData({ ...song, num_ky: resultKyNum });
    } else {
      // No number found in the first search result; remember the song as failed.
      saveCrawlYoutubeFailedKYSongs(song.title, song.artist);
    }

    index++;
    console.log('scrapeSongNumber : ', index);
  }
} finally {
  // browser.close() returns a promise; await it so Chromium is shut down
  // before the script finishes.
  await browser.close();
}