justalphie committed on
Commit
7c9b115
·
1 Parent(s): e813c1c

scrape descriptions

Browse files
scraping_activities.py CHANGED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import playwright
2
+ import playwright.sync_api
3
+ from playwright.sync_api import sync_playwright
4
+ import json
5
+ from tqdm import tqdm
6
+
7
+ with open ("sections.json", "r") as f:
8
+ data = json.load(f)
9
+
10
+ with sync_playwright() as p:
11
+ browser = p.chromium.launch(headless=True)
12
+ page = browser.new_page()
13
+
14
+ def fetch_description(url, selector):
15
+
16
+ page.goto(url)
17
+ page.wait_for_timeout(3000)
18
+ description = page.query_selector(selector).text_content()
19
+ return description
20
+
21
+ selector_description = "div.s-rte"
22
+
23
+ def check_if_url(url,selector):
24
+ if url is not None:
25
+ return fetch_description(url, selector)
26
+ else:
27
+ pass
28
+
29
+ for item in tqdm(data):
30
+ for i in range(len(item["activities"])):
31
+ description = check_if_url((item["activities"][i]["url"]), selector_description)
32
+ item["activities"][i]["description"] = description
33
+ print(item["activities"][i]["description"])
34
+
35
+ with open ("sections_with_details.json", "w") as f:
36
+ json.dump(data, f)
scraping_playwright.py DELETED
@@ -1,112 +0,0 @@
1
- import playwright
2
- from playwright.sync_api import sync_playwright
3
- import json
4
- import playwright.sync_api
5
-
6
-
7
- url = "https://techorama.be/agenda/"
8
- selector_links = "div.m-subject__container-inner.a-box-simple__inner-2 > a"
9
-
10
- with sync_playwright() as p:
11
- browser = p.chromium.launch(headless=True)
12
- page = browser.new_page()
13
- page.goto(url)
14
- page.wait_for_timeout(3000)
15
- links_elements = page.query_selector_all(selector_links)
16
- links = [element.get_property("href").json_value() for element in links_elements] # .get_attribute("href")
17
- filepath = "links_tue.csv"
18
- #with open(filepath, "w") as f:
19
- #f.write("\n".join(links))
20
- page.locator("button:has-text(\"wednesday\")").click()
21
- links_elements = page.query_selector_all(selector_links)
22
- links = [element.get_property("href").json_value() for element in links_elements] # .get_attribute("href")
23
- filepath = "links_wed.csv"
24
- #with open(filepath, "w") as f:
25
- #f.write("\n".join(links))
26
-
27
- browser.close()
28
-
29
- #TODO
30
- #1 save the structure [{"time":"...", "activities":[{"name_of_activity":"...", "speaker_name": "...", general_topic": "topic name", "room_number":"..". "url":"...", "date":"...", "time":"time"}]}
31
- #go to each link, and fetch the description and add it to the dictionary
32
- #save a csv
33
-
34
-
35
- data_technorama = [{"time":"", "activities":[{"name_of_activity":"", "speaker_name": "...", "general_topic": "topic name", "room_number":"..", "url":"...", "date":"...", "time":"time"}]}]
36
- selector_schedule_section = "article.o-schedule__section"
37
- selector_time = "h4.o-schedule__section-title"
38
- selector_activity = "article.m-subject"
39
- selector_activity_title = "h3.m-subject__title"
40
- selector_activity_speaker_name = "p.m-subject__name"
41
- selector_activity_room_number = "p.m-subject__room"
42
- selector_activity_track_label = "p.m-subject__track-label"
43
- selector_url = "div.m-subject__container-inner.a-box-simple__inner-2 > a"
44
-
45
- def get_text_content_of(element: playwright.sync_api.ElementHandle):
46
- if element is None: return None
47
- return element.text_content()
48
- def get_url_of(element: playwright.sync_api.ElementHandle):
49
- if element is None: return None
50
- return element.get_property("href").json_value()
51
-
52
- with sync_playwright() as p:
53
- browser = p.chromium.launch(headless=True)
54
- page = browser.new_page()
55
- page.goto(url)
56
- page.wait_for_timeout(3000)
57
- schedule_section_elems = page.query_selector_all(selector_schedule_section)
58
-
59
- sections = []
60
- for schedule_section_elem in schedule_section_elems:
61
- section_time = schedule_section_elem.query_selector(selector_time).text_content()
62
- section_activities = schedule_section_elem.query_selector_all(selector_activity)
63
- section_activities_list = []
64
- for section_activity in section_activities:
65
- activity_name = get_text_content_of(section_activity.query_selector(selector_activity_title))
66
- speaker_name = get_text_content_of(section_activity.query_selector(selector_activity_speaker_name))
67
- room_number = get_text_content_of(section_activity.query_selector(selector_activity_room_number))
68
- track_label = get_text_content_of(section_activity.query_selector(selector_activity_track_label))
69
- url_activity = get_url_of(section_activity.query_selector(selector_url))
70
- section_activities_list.append({"name_of_activity": activity_name,
71
- "speaker_name": speaker_name,
72
- "room_number":room_number,
73
- "track_label": track_label,
74
- "url": url_activity,
75
- "day_of_week":"Tuesday",
76
- "date": "2024/05/07",
77
- "time":section_time})
78
- sections.append({
79
- "time": section_time,
80
- "activities": section_activities_list
81
- })
82
-
83
- page.locator("button:has-text(\"wednesday\")").click()
84
-
85
- schedule_section_elems = page.query_selector_all(selector_schedule_section)
86
- for schedule_section_elem in schedule_section_elems:
87
- section_time = schedule_section_elem.query_selector(selector_time).text_content()
88
- section_activities = schedule_section_elem.query_selector_all(selector_activity)
89
- section_activities_list = []
90
- for section_activity in section_activities:
91
- activity_name = get_text_content_of(section_activity.query_selector(selector_activity_title))
92
- speaker_name = get_text_content_of(section_activity.query_selector(selector_activity_speaker_name))
93
- room_number = get_text_content_of(section_activity.query_selector(selector_activity_room_number))
94
- track_label = get_text_content_of(section_activity.query_selector(selector_activity_track_label))
95
- url_activity = get_url_of(section_activity.query_selector(selector_url))
96
- section_activities_list.append({"name_of_activity": activity_name,
97
- "speaker_name": speaker_name,
98
- "room_number":room_number,
99
- "track_label": track_label,
100
- "url": url_activity,
101
- "day_of_week":"Wednesday",
102
- "date": "2024/05/08",
103
- "time":section_time})
104
- sections.append({
105
- "time": section_time,
106
- "activities": section_activities_list
107
- })
108
- # Save sections to JSON file
109
- with open("sections.json", "w") as f:
110
- json.dump(sections, f)
111
- browser.close()
112
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scraping_playwright_steps.ipynb DELETED
File without changes