From 8811d9c0feec911bc3f169363984678278a266e1 Mon Sep 17 00:00:00 2001 From: soap Date: Wed, 6 Mar 2024 13:09:21 -0600 Subject: [PATCH] update hrtcafe.net + change domain in the useragents --- .github/workflows/diyhrt.market.yml | 2 +- .github/workflows/diyhrt.wiki.yml | 2 +- .github/workflows/hrt.coffee.yml | 2 +- .../{diyhrt.cafe.yml => hrtcafe.net.yml} | 2 +- .github/workflows/transfemscience.org.yml | 2 +- .github/workflows/update_cafe.yml | 32 ------------------- scripts/cafe.txt | 0 scripts/formatlinks.py | 27 ---------------- scripts/getlinks.py | 29 ----------------- 9 files changed, 5 insertions(+), 93 deletions(-) rename .github/workflows/{diyhrt.cafe.yml => hrtcafe.net.yml} (69%) delete mode 100644 .github/workflows/update_cafe.yml delete mode 100644 scripts/cafe.txt delete mode 100644 scripts/formatlinks.py delete mode 100644 scripts/getlinks.py diff --git a/.github/workflows/diyhrt.market.yml b/.github/workflows/diyhrt.market.yml index 7c6c346b..fdb2e725 100644 --- a/.github/workflows/diyhrt.market.yml +++ b/.github/workflows/diyhrt.market.yml @@ -24,7 +24,7 @@ jobs: - name: download diyhrt.market run: | rm -rf diyhrt.market cdn.diyhrt.market - wget -mkpKEH --limit-rate 10m --random-wait --user-agent="Contact: hrt.boytw.ink/crawl_contact.txt" --domains=diyhrt.market,cdn.diyhrt.market https://diyhrt.market || true + wget -mkpKEH --limit-rate 10m --random-wait --user-agent="github.com/soapingtime/diyhrt" --domains=diyhrt.market,cdn.diyhrt.market https://diyhrt.market || true find . -name "*.orig" -type f -delete - name: commit changes if: ${{ success() }} diff --git a/.github/workflows/diyhrt.wiki.yml b/.github/workflows/diyhrt.wiki.yml index dcd2c35a..add4eb97 100644 --- a/.github/workflows/diyhrt.wiki.yml +++ b/.github/workflows/diyhrt.wiki.yml @@ -24,7 +24,7 @@ jobs: - name: download diyhrt.wiki run: | rm -rf diyhrt.wiki - wget -mkxKE -e robots=off --limit-rate=10m --random-wait --user-agent="Contact: hrt.boytw.ink/crawl_contact.txt" --reject '*age-check*' https://diyhrt.wiki https://diyhrt.wiki/nav.html || true # downloads nav.html because it doesn't crawl properly without it + wget -mkxKE -e robots=off --limit-rate=10m --random-wait --user-agent="github.com/soapingtime/diyhrt" --reject '*age-check*' https://diyhrt.wiki https://diyhrt.wiki/nav.html || true # downloads nav.html because it doesn't crawl properly without it cd diyhrt.wiki && rm *.orig && cd .. - name: setup python uses: actions/setup-python@v4 diff --git a/.github/workflows/hrt.coffee.yml b/.github/workflows/hrt.coffee.yml index bc37a3f7..a4fec02a 100644 --- a/.github/workflows/hrt.coffee.yml +++ b/.github/workflows/hrt.coffee.yml @@ -28,7 +28,7 @@ jobs: - name: Download hrt.coffee run: | rm -rf hrt.coffee - export USER_AGENT="Mosaic/9.0 (Commodore AmigaOS 4.0) AmiWeb/4.1.33 (KHTML, like Gecko) Version/22.1 Mozilla/5.0. Contact: https://boytw.ink/crawl_contact.txt" + export USER_AGENT="Mosaic/9.0 (Commodore AmigaOS 4.0) AmiWeb/4.1.33 (KHTML, like Gecko) Version/22.1 Mozilla/5.0. github.com/soapingtime/diyhrt" wget --adjust-extension --convert-links -e robots=off --mirror --page-requisites --waitretry 5 --timeout 60 --tries 5 --wait 1 -U "$USER_AGENT" https://hrt.coffee - name: Commit changes if: ${{ success() }} diff --git a/.github/workflows/diyhrt.cafe.yml b/.github/workflows/hrtcafe.net.yml similarity index 69% rename from .github/workflows/diyhrt.cafe.yml rename to .github/workflows/hrtcafe.net.yml index cd87ac3e..7c2d235f 100644 --- a/.github/workflows/diyhrt.cafe.yml +++ b/.github/workflows/hrtcafe.net.yml @@ -26,7 +26,7 @@ jobs: run: | rm -rf diyhrt.cafe rm -rf hrtcafe.net - wget --convert-links --page-requisites --span-hosts --no-parent --reject 'rocket-loader.min.js' --reject '*UserLogin*' --reject '*visualEditor*' --reject '*CreateAccount*' --reject '*load.php*' --wait 3 --limit-rate 4m --continue --random-wait --adjust-extension -U 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.6) Gecko/20070802 SeaMonkey/1.1.4 Contact: hrt.boytw.ink/crawl_contact.txt' -e robots=off -i scripts/cafe.txt || true + wget -mkpKEH --limit-rate 10m --random-wait --user-agent="github.com/soapingtime/diyhrt" --domains=hrtcafe.net https://hrtcafe.net || true - name: commit changes if: ${{ success() }} uses: EndBug/add-and-commit@v9 diff --git a/.github/workflows/transfemscience.org.yml b/.github/workflows/transfemscience.org.yml index bd777364..551aa47a 100644 --- a/.github/workflows/transfemscience.org.yml +++ b/.github/workflows/transfemscience.org.yml @@ -24,7 +24,7 @@ jobs: - name: download transfemscience run: | rm -rf transfemscience.org - wget -mkpKEH --limit-rate 10m --random-wait -e robots=off --user-agent="Contact: hrt.boytw.ink/crawl_contact.txt" --domains=transfemscience.org https://transfemscience.org || true + wget -mkpKEH --limit-rate 10m --random-wait -e robots=off --user-agent="github.com/soapingtime/diyhrt" --domains=transfemscience.org https://transfemscience.org || true find . -name "*.orig" -type f -delete - name: commit changes if: ${{ success() }} diff --git a/.github/workflows/update_cafe.yml b/.github/workflows/update_cafe.yml deleted file mode 100644 index 8b38cde3..00000000 --- a/.github/workflows/update_cafe.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Update DIYHRT.cafe links - -on: - workflow_dispatch: - schedule: - - cron: "30 23 1 * *" # every 1st of month - -jobs: - update: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.PAT_TOKEN }} - - uses: actions/setup-python@v4 - with: - python-version: pypy-3.7-v7.3.3 - - name: Update links - if: always() - run: | - rm scripts/cafe.txt - pip install beautifulsoup4 bs4 - python scripts/getlinks.py 'https://hrtcafe.net/index.php/Special:AllPages' > scripts/cafe.txt - cd scripts && python formatlinks.py cafe.txt - - name: Commit changes - if: ${{ success() }} - uses: EndBug/add-and-commit@v9 - with: - add: "scripts/cafe.txt" - pull: "--rebase --autostash ." - message: "[automated] update diyhrt.cafe list" - default_author: github_actions diff --git a/scripts/cafe.txt b/scripts/cafe.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/scripts/formatlinks.py b/scripts/formatlinks.py deleted file mode 100644 index ef00a6a2..00000000 --- a/scripts/formatlinks.py +++ /dev/null @@ -1,27 +0,0 @@ -import sys - -def remove_trailing_spaces(line): - return line.rstrip() - -def remove_duplicate_lines(file_path): - lines_seen = set() - output_lines = [] - - with open(file_path, 'r') as file: - for line in file: - cleaned_line = remove_trailing_spaces(line) - - if cleaned_line.startswith("https://diyhrt.cafe") and cleaned_line not in lines_seen: - lines_seen.add(cleaned_line) - output_lines.append(line) - - with open(file_path, 'w') as file: - file.writelines(output_lines) - -# Usage example -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Please provide the file path as an argument.") - else: - file_path = sys.argv[1] - remove_duplicate_lines(file_path) diff --git a/scripts/getlinks.py b/scripts/getlinks.py deleted file mode 100644 index 0b9f9651..00000000 --- a/scripts/getlinks.py +++ /dev/null @@ -1,29 +0,0 @@ -from bs4 import BeautifulSoup -from urllib.request import Request, urlopen -import re -import sys -from urllib.parse import urljoin - -if len(sys.argv) < 2: - print("Please provide a URL.") - sys.exit(1) - -url = sys.argv[1] - -headers = {'User-Agent': 'Mozilla/5.0'} - -req = Request(url, headers=headers) - -html_page = urlopen(req) - -soup = BeautifulSoup(html_page, "html.parser") - -links = [] -for link in soup.findAll('a'): - href = link.get('href') - if href: - full_url = urljoin(url, href) - links.append(full_url) - -formatted_links = '\n'.join(links) -print(formatted_links)