dw-commentfest-scrape/scraper.py
2026-06-14 19:48:45 -04:00

75 lines
1.9 KiB
Python

# Request headers for the scrape: a custom User-Agent built from the scraper's
# product token and version (0.1), a "+URL" token, and an OS release token.
headers = {
    "User-Agent": (
        "hellodwitsagnesimdoingashortscrapeforacommentfestokaythankyou/0.1 "
        "+https://alien.holiday "
        "Ubuntu/24.04.3"
    ),
}
# Author's host note: "Server: Ubuntu/24.03.3". The User-Agent above says
# 24.04.3 -- TODO confirm which release string is the intended one.
import requests
import os
import time
from datetime import datetime
from requests_html import HTMLSession
import json
# Page whose flat comment view is scraped.
PAGE_URL = "https://news.mourningdove.club/614.html?view=flat#comments"

# SECURITY NOTE(review): this webhook URL embeds a secret token. Anyone who can
# read this file can post to the channel. It is kept as the default so existing
# deployments keep working, but prefer setting DISCORD_WEBHOOK_URL in the
# environment and rotating the token that is committed here.
webhook_url = os.environ.get(
    "DISCORD_WEBHOOK_URL",
    "https://discordapp.com/api/webhooks/1515658607457206492/I3e0Fyh2FzW2rmW7SCfFNmy8pt1er1rLuaVpeDriELAFbDirJC-7HIP4AG4DfUVeVAiu",
)

# JSON file remembering the permalink of the last comment already reported.
STATE_FILE = "last_run.json"

# Seconds to wait for the site / Discord before giving up on a request.
REQUEST_TIMEOUT = 30

# Discord rejects messages whose "content" is longer than this many characters.
DISCORD_CONTENT_LIMIT = 2000

# Layout of the message posted for each new comment.
MESSAGE_TEMPLATE = "```html\n<a href=\"{link}\">{text}</a>\n```"


def load_state(path: str) -> dict:
    """Return the saved state dict from *path*.

    A missing file, unparsable JSON, or a JSON value that is not an object
    all yield an empty dict, so callers can always use ``state.get(...)``.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    return loaded if isinstance(loaded, dict) else {}


def save_state(path: str, state: dict) -> None:
    """Write *state* to *path* as indented JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)


def fetch_comments(url: str) -> list:
    """Download *url* and return its comments in page order.

    Each entry is ``{"text": ..., "comment_link": ...}``. Comment bodies and
    permalink anchors are paired positionally, which assumes the page lists
    exactly one permalink per comment body -- TODO confirm against the markup.

    Raises ``requests.HTTPError`` when the site answers with an error status,
    so an error page is never mistaken for "no comments".
    """
    session = HTMLSession()
    # Send the module-level ``headers`` dict so the site sees the custom
    # User-Agent instead of the library default.
    r = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    bodies = r.html.find('.comment-content')
    permalinks = r.html.find(".link.commentpermalink a")
    return [
        {"text": body.text, "comment_link": anchor.attrs["href"]}
        for body, anchor in zip(bodies, permalinks)
    ]


def find_new_comments(comment_content: list, last_seen_url) -> list:
    """Return the comments that appear after *last_seen_url* in page order.

    Returns an empty list when the marker is ``None`` or is not present in
    *comment_content*.
    """
    found = False
    fresh = []
    for entry in comment_content:
        if entry["comment_link"] == last_seen_url:
            found = True
        elif found:
            fresh.append(entry)
    return fresh


def format_comment_message(comment: dict, limit: int = DISCORD_CONTENT_LIMIT) -> str:
    """Render *comment* as the Discord message body, at most *limit* characters.

    Over-long comment text is cut and suffixed with an ellipsis so the webhook
    does not reject the whole message; the link is never truncated.
    """
    link = comment["comment_link"]
    text = comment["text"]
    overhead = len(MESSAGE_TEMPLATE.format(link=link, text=""))
    room = max(limit - overhead, 0)
    if len(text) > room:
        if room > 0:
            text = text[:room - 1] + "…"
        else:
            text = ""
    return MESSAGE_TEMPLATE.format(link=link, text=text)


def post_to_discord(content: str) -> None:
    """Post *content* to the webhook; raise ``requests.HTTPError`` on rejection."""
    response = requests.post(
        webhook_url,
        json={"content": content},
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()


def main() -> None:
    """Run one scrape and report comments newer than the stored marker.

    On the first run (no marker stored) the newest comment is recorded and
    nothing is reported as new. Otherwise every comment after the marker is
    posted individually, and the marker is advanced after each successful
    post so a failure part-way through does not lose or repeat comments.
    """
    # Pause kept from the original script before contacting the site.
    time.sleep(5)
    comment_content = fetch_comments(PAGE_URL)
    state = load_state(STATE_FILE)
    last_seen_url = state.get("last_seen_url")

    if not last_seen_url:
        # Nothing to anchor on when the page has no comments yet; try again
        # on the next run instead of crashing on an empty list.
        if comment_content:
            state["last_seen_url"] = comment_content[-1]["comment_link"]
            save_state(STATE_FILE, state)
            print("Initialized!")
    elif all(c["comment_link"] != last_seen_url for c in comment_content):
        # Without the marker on the page nothing can be classified as new.
        print("Warning: stored last_seen_url was not found on the page;",
              "delete", STATE_FILE, "to re-initialize")

    newcomments = find_new_comments(comment_content, last_seen_url)
    if not newcomments:
        post_to_discord("```No new comments since last run```")
        return

    for c in newcomments:
        post_to_discord(format_comment_message(c))
        print(c["comment_link"])
        # Checkpoint after every delivered comment.
        state["last_seen_url"] = c["comment_link"]
        save_state(STATE_FILE, state)
    print("newcomments =", len(newcomments))


if __name__ == "__main__":
    main()