mirror of
https://github.com/soapingtime/diyhrt.git
synced 2026-03-22 23:26:24 +00:00
update hrtcafe.net + change domain in the useragents
This commit is contained in:
parent
7d5a3caf73
commit
8811d9c0fe
9 changed files with 5 additions and 93 deletions
2
.github/workflows/diyhrt.market.yml
vendored
2
.github/workflows/diyhrt.market.yml
vendored
|
|
@ -24,7 +24,7 @@ jobs:
|
||||||
- name: download diyhrt.market
|
- name: download diyhrt.market
|
||||||
run: |
|
run: |
|
||||||
rm -rf diyhrt.market cdn.diyhrt.market
|
rm -rf diyhrt.market cdn.diyhrt.market
|
||||||
wget -mkpKEH --limit-rate 10m --random-wait --user-agent="Contact: hrt.boytw.ink/crawl_contact.txt" --domains=diyhrt.market,cdn.diyhrt.market https://diyhrt.market || true
|
wget -mkpKEH --limit-rate 10m --random-wait --user-agent="github.com/soapingtime/diyhrt" --domains=diyhrt.market,cdn.diyhrt.market https://diyhrt.market || true
|
||||||
find . -name "*.orig" -type f -delete
|
find . -name "*.orig" -type f -delete
|
||||||
- name: commit changes
|
- name: commit changes
|
||||||
if: ${{ success() }}
|
if: ${{ success() }}
|
||||||
|
|
|
||||||
2
.github/workflows/diyhrt.wiki.yml
vendored
2
.github/workflows/diyhrt.wiki.yml
vendored
|
|
@ -24,7 +24,7 @@ jobs:
|
||||||
- name: download diyhrt.wiki
|
- name: download diyhrt.wiki
|
||||||
run: |
|
run: |
|
||||||
rm -rf diyhrt.wiki
|
rm -rf diyhrt.wiki
|
||||||
wget -mkxKE -e robots=off --limit-rate=10m --random-wait --user-agent="Contact: hrt.boytw.ink/crawl_contact.txt" --reject '*age-check*' https://diyhrt.wiki https://diyhrt.wiki/nav.html || true # downloads nav.html because it doesn't crawl properly without it
|
wget -mkxKE -e robots=off --limit-rate=10m --random-wait --user-agent="github.com/soapingtime/diyhrt" --reject '*age-check*' https://diyhrt.wiki https://diyhrt.wiki/nav.html || true # downloads nav.html because it doesn't crawl properly without it
|
||||||
cd diyhrt.wiki && rm *.orig && cd ..
|
cd diyhrt.wiki && rm *.orig && cd ..
|
||||||
- name: setup python
|
- name: setup python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
|
|
|
||||||
2
.github/workflows/hrt.coffee.yml
vendored
2
.github/workflows/hrt.coffee.yml
vendored
|
|
@ -28,7 +28,7 @@ jobs:
|
||||||
- name: Download hrt.coffee
|
- name: Download hrt.coffee
|
||||||
run: |
|
run: |
|
||||||
rm -rf hrt.coffee
|
rm -rf hrt.coffee
|
||||||
export USER_AGENT="Mosaic/9.0 (Commodore AmigaOS 4.0) AmiWeb/4.1.33 (KHTML, like Gecko) Version/22.1 Mozilla/5.0. Contact: https://boytw.ink/crawl_contact.txt"
|
export USER_AGENT="Mosaic/9.0 (Commodore AmigaOS 4.0) AmiWeb/4.1.33 (KHTML, like Gecko) Version/22.1 Mozilla/5.0. github.com/soapingtime/diyhrt"
|
||||||
wget --adjust-extension --convert-links -e robots=off --mirror --page-requisites --waitretry 5 --timeout 60 --tries 5 --wait 1 -U "$USER_AGENT" https://hrt.coffee
|
wget --adjust-extension --convert-links -e robots=off --mirror --page-requisites --waitretry 5 --timeout 60 --tries 5 --wait 1 -U "$USER_AGENT" https://hrt.coffee
|
||||||
- name: Commit changes
|
- name: Commit changes
|
||||||
if: ${{ success() }}
|
if: ${{ success() }}
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
rm -rf diyhrt.cafe
|
rm -rf diyhrt.cafe
|
||||||
rm -rf hrtcafe.net
|
rm -rf hrtcafe.net
|
||||||
wget --convert-links --page-requisites --span-hosts --no-parent --reject 'rocket-loader.min.js' --reject '*UserLogin*' --reject '*visualEditor*' --reject '*CreateAccount*' --reject '*load.php*' --wait 3 --limit-rate 4m --continue --random-wait --adjust-extension -U 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.6) Gecko/20070802 SeaMonkey/1.1.4 Contact: hrt.boytw.ink/crawl_contact.txt' -e robots=off -i scripts/cafe.txt || true
|
wget -mkpKEH --limit-rate 10m --random-wait --user-agent="github.com/soapingtime/diyhrt" --domains=hrtcafe.net https://hrtcafe.net || true
|
||||||
- name: commit changes
|
- name: commit changes
|
||||||
if: ${{ success() }}
|
if: ${{ success() }}
|
||||||
uses: EndBug/add-and-commit@v9
|
uses: EndBug/add-and-commit@v9
|
||||||
2
.github/workflows/transfemscience.org.yml
vendored
2
.github/workflows/transfemscience.org.yml
vendored
|
|
@ -24,7 +24,7 @@ jobs:
|
||||||
- name: download transfemscience
|
- name: download transfemscience
|
||||||
run: |
|
run: |
|
||||||
rm -rf transfemscience.org
|
rm -rf transfemscience.org
|
||||||
wget -mkpKEH --limit-rate 10m --random-wait -e robots=off --user-agent="Contact: hrt.boytw.ink/crawl_contact.txt" --domains=transfemscience.org https://transfemscience.org || true
|
wget -mkpKEH --limit-rate 10m --random-wait -e robots=off --user-agent="github.com/soapingtime/diyhrt" --domains=transfemscience.org https://transfemscience.org || true
|
||||||
find . -name "*.orig" -type f -delete
|
find . -name "*.orig" -type f -delete
|
||||||
- name: commit changes
|
- name: commit changes
|
||||||
if: ${{ success() }}
|
if: ${{ success() }}
|
||||||
|
|
|
||||||
32
.github/workflows/update_cafe.yml
vendored
32
.github/workflows/update_cafe.yml
vendored
|
|
@ -1,32 +0,0 @@
|
||||||
# Monthly job that regenerates scripts/cafe.txt, the list of page URLs that
# the mirror workflow feeds to wget, and commits the refreshed list.
name: Update DIYHRT.cafe links

on:
  workflow_dispatch:
  schedule:
    - cron: "30 23 1 * *" # every 1st of month

jobs:
  update:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          # A personal access token is used for the checkout (and therefore
          # for the later push) instead of the default workflow token.
          token: ${{ secrets.PAT_TOKEN }}
      - uses: actions/setup-python@v4
        with:
          python-version: pypy-3.7-v7.3.3
      - name: Update links
        # always(): run this step even if an earlier step reported failure.
        if: always()
        run: |
          # -f: do not fail the step when the list does not exist yet.
          rm -f scripts/cafe.txt
          # "bs4" on PyPI is only an alias for beautifulsoup4; one install is enough.
          pip install beautifulsoup4
          python scripts/getlinks.py 'https://hrtcafe.net/index.php/Special:AllPages' > scripts/cafe.txt
          cd scripts && python formatlinks.py cafe.txt
      - name: Commit changes
        if: ${{ success() }}
        uses: EndBug/add-and-commit@v9
        with:
          add: "scripts/cafe.txt"
          pull: "--rebase --autostash ."
          message: "[automated] update diyhrt.cafe list"
          default_author: github_actions
|
|
||||||
|
|
@ -1,27 +0,0 @@
|
||||||
import sys
|
|
||||||
|
|
||||||
def remove_trailing_spaces(line):
    """Return *line* without trailing whitespace (including the newline)."""
    return line.rstrip()


def remove_duplicate_lines(file_path, prefix="https://diyhrt.cafe"):
    """Rewrite *file_path* in place, keeping only unique lines under *prefix*.

    A line is kept when, after trailing whitespace is removed, it starts with
    *prefix* and has not been seen before. Kept lines are written back
    unmodified and in their original order; everything else is dropped.

    Args:
        file_path: Path of the text file to filter in place.
        prefix: Required start of a kept line. May be a string or a tuple of
            strings (anything ``str.startswith`` accepts). Defaults to the
            historical ``https://diyhrt.cafe`` so existing callers behave the
            same; pass another site root to filter a list for that site.
    """
    seen = set()
    kept = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            cleaned = remove_trailing_spaces(line)
            # Compare on the cleaned text so "url" and "url   " count as the
            # same entry, but keep the first occurrence exactly as written.
            if cleaned.startswith(prefix) and cleaned not in seen:
                seen.add(cleaned)
                kept.append(line)

    # The file is reopened for writing only after reading has finished, so
    # the input is fully consumed before it is truncated.
    with open(file_path, "w", encoding="utf-8") as file:
        file.writelines(kept)
|
|
||||||
|
|
||||||
# Script entry point: expects the path of the link list as the first
# command-line argument and filters that file in place.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # No path given: report it and fall through without touching any file.
        print("Please provide the file path as an argument.")
    else:
        file_path = sys.argv[1]
        remove_duplicate_lines(file_path)
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from urllib.request import Request, urlopen
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
from urllib.parse import urljoin
|
|
||||||
|
|
||||||
def main():
    """Print every link found on the page given as the first CLI argument.

    The page is fetched with a browser-like User-Agent header, each
    ``<a href=...>`` is resolved against the page URL, and the resulting
    absolute URLs are printed one per line in document order. Exits with
    status 1 when no URL argument is supplied.
    """
    if len(sys.argv) < 2:
        print("Please provide a URL.")
        sys.exit(1)

    url = sys.argv[1]
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=headers)

    # Parse inside the context manager so the HTTP response is always closed,
    # even if parsing raises.
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "html.parser")

    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href (or with an empty one) carry no target.
        if href:
            links.append(urljoin(url, href))

    print('\n'.join(links))


if __name__ == "__main__":
    main()
|
|
||||||
Loading…
Add table
Reference in a new issue