summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--in/index.html1
-rw-r--r--in/smallunix.html2
-rw-r--r--in/tinkering.html3
-rwxr-xr-xqdsg.sh51
-rw-r--r--robots.txt67
5 files changed, 120 insertions, 4 deletions
diff --git a/in/index.html b/in/index.html
index cc8de7a..c3ea0b5 100644
--- a/in/index.html
+++ b/in/index.html
@@ -1,3 +1,4 @@
+TITLE: Index
<h2>Über mich</h2>
<p>
Hallo, ich bin Maite (sie/ihr). Ich bastel gerne mit Technik rum, mache
diff --git a/in/smallunix.html b/in/smallunix.html
index 5f9167a..475226b 100644
--- a/in/smallunix.html
+++ b/in/smallunix.html
@@ -43,7 +43,7 @@ haben.</p>
<h2>Quellcode</h2>
<p>Der Quellcode kann auf meiner cgit-Instanz
-(<a href='https://cgit.zeldakatze.de/smallUnix/'>Hier</a>) gefunden werden.
+(<a href="https://cgit.zeldakatze.de/smallUnix/">Hier</a>) gefunden werden.
Zum Kompilieren gibt es ein eigenes Toolchain, das zunächst kompiliert werden
muss. Anschließend kann das System recht leicht mit ./build.sh kompiliert werden
</p>
diff --git a/in/tinkering.html b/in/tinkering.html
index 567e514..5bc38bb 100644
--- a/in/tinkering.html
+++ b/in/tinkering.html
@@ -1,8 +1,9 @@
+TITLE: Basteleien
Ich habe derzeit einige Projekte an denen ich Arbeite, einige davon sind hier
aufgezählt.
-<a href='smallunix.html'><h2>smallUnix</h2></a>
+<a href="smallunix.html"><h2>smallUnix</h2></a>
Ein sich Unix-Ähnlich anfühlendes System, das nicht-kontinuierliche Adressräume
verwendet.
diff --git a/qdsg.sh b/qdsg.sh
index 28bbb07..41cd9b5 100755
--- a/qdsg.sh
+++ b/qdsg.sh
@@ -1,14 +1,61 @@
#!/bin/bash
+# re-encode $1 into $2 with video codec $3, skipping work if $2 already exists
+run_ffmpeg_if_not_exist() {
+	local vid_input=$1
+	local out_basename=$2
+	local vid_codec=$3
+	[ -e "$out_basename" ] || ffmpeg -i "$vid_input" -c:v "$vid_codec" "$out_basename"
+}
+
+# generates multiple resolutions for the given video file.
+# $1: file name
+# @return
+encode_video() {
+	# name the input
+	local vid_input=$1
+	local vid
+
+	# TODO(review): still a stub — call run_ffmpeg_if_not_exist per resolution
+}
+
+# ensure that the necessary directories exist
mkdir -p out/
+mkdir -p out/video/mp4/
+
+# copy robots.txt
+echo "Copying robots.txt"
+cp robots.txt out/
+# process every html site
for inFile in in/*.html; do
[ -e "$inFile" ] || continue
echo "Processing file $inFile"
filename=$(basename "$inFile")
outFile="out/$filename"
-	cat header.html > "$outFile"
-	cat "$inFile" >> "$outFile"
+
+	# get the title of the file if it is noted in the file
+	site_title=""
+	firstline=$(head -n1 "$inFile")
+	[[ $firstline == TITLE:* ]] && {
+		site_title="${firstline#TITLE: } - "
+	}
+
+	# assemble the page
+	sed "s/%SITE_TITLE%/$site_title/g" header.html > "$outFile"
+	if [[ $firstline == TITLE:* ]]; then
+		tail -n +2 "$inFile" >> "$outFile"
+	else
+		printf '\t%s\n' "$filename does not seem to have a title!"
+		cat "$inFile" >> "$outFile"
+	fi
+
cat footer.html >> "$outFile"
done
+
+for inFile in videos/*; do
+	sleep 0
+done
+
+
diff --git a/robots.txt b/robots.txt
new file mode 100644
index 0000000..eab12f6
--- /dev/null
+++ b/robots.txt
@@ -0,0 +1,67 @@
+# Block all known AI crawlers and assistants
+# from using content for training AI models.
+# Source: https://robotstxt.com/ai
+User-Agent: GPTBot
+User-Agent: ClaudeBot
+User-Agent: Claude-User
+User-Agent: Claude-SearchBot
+User-Agent: CCBot
+User-Agent: Google-Extended
+User-Agent: Applebot-Extended
+User-Agent: Facebookbot
+User-Agent: Meta-ExternalAgent
+User-Agent: Meta-ExternalFetcher
+User-Agent: diffbot
+User-Agent: PerplexityBot
+User-Agent: Perplexity-User
+User-Agent: Omgili
+User-Agent: Omgilibot
+User-Agent: webzio-extended
+User-Agent: ImagesiftBot
+User-Agent: Bytespider
+User-Agent: TikTokSpider
+User-Agent: Amazonbot
+User-Agent: Youbot
+User-Agent: SemrushBot-OCOB
+User-Agent: Petalbot
+User-Agent: VelenPublicWebCrawler
+User-Agent: TurnitinBot
+User-Agent: Timpibot
+User-Agent: OAI-SearchBot
+User-Agent: ICC-Crawler
+User-Agent: AI2Bot
+User-Agent: AI2Bot-Dolma
+User-Agent: DataForSeoBot
+User-Agent: AwarioBot
+User-Agent: AwarioSmartBot
+User-Agent: AwarioRssBot
+User-Agent: Google-CloudVertexBot
+User-Agent: PanguBot
+User-Agent: Kangaroo Bot
+User-Agent: Sentibot
+User-Agent: img2dataset
+User-Agent: Meltwater
+User-Agent: Seekr
+User-Agent: peer39_crawler
+User-Agent: cohere-ai
+User-Agent: cohere-training-data-crawler
+User-Agent: DuckAssistBot
+User-Agent: Scrapy
+User-Agent: Cotoyogi
+User-Agent: aiHitBot
+User-Agent: Factset_spyderbot
+User-Agent: FirecrawlAgent
+
+Disallow: /
+DisallowAITraining: /
+
+# Block any non-specified AI crawlers (e.g., new
+# or unknown bots) from using content for training
+# AI models, while allowing the website to be
+# indexed and accessed by bots. These directives
+# are still experimental and may not be supported
+# by all AI crawlers.
+User-Agent: *
+DisallowAITraining: /
+Content-Usage: ai=n
+Allow: /