diff options
author | zeldakatze <coffee@zeldakatze.de> | 2025-10-08 10:23:53 +0200 |
---|---|---|
committer | zeldakatze <coffee@zeldakatze.de> | 2025-10-08 10:23:53 +0200 |
commit | efbe8e78a32e14e791f764ae80397c5d5b13f3b9 (patch) | |
tree | d345547ac0fdad306b9c1fd9af194afad54fbb40 | |
parent | d0c9031ef9cbe6ea8e92a5f8d2114fad524864f4 (diff) | |
download | website-efbe8e78a32e14e791f764ae80397c5d5b13f3b9.tar.gz website-efbe8e78a32e14e791f764ae80397c5d5b13f3b9.zip |
-rw-r--r-- | in/index.html | 1 | ||||
-rw-r--r-- | in/smallunix.html | 2 | ||||
-rw-r--r-- | in/tinkering.html | 3 | ||||
-rwxr-xr-x | qdsg.sh | 51 | ||||
-rw-r--r-- | robots.txt | 67 |
5 files changed, 120 insertions, 4 deletions
diff --git a/in/index.html b/in/index.html index cc8de7a..c3ea0b5 100644 --- a/in/index.html +++ b/in/index.html @@ -1,3 +1,4 @@ +TITLE: Index <h2>Über mich</h2> <p> Hallo, ich bin Maite (sie/ihr). Ich bastel gerne mit Technik rum, mache diff --git a/in/smallunix.html b/in/smallunix.html index 5f9167a..475226b 100644 --- a/in/smallunix.html +++ b/in/smallunix.html @@ -43,7 +43,7 @@ haben.</p> <h2>Quellcode</h2> <p>Der Quellcode kann auf meiner cgit-Instanz -(<a href='https://cgit.zeldakatze.de/smallUnix/'>Hier</a>) gefunden werden. +(<a href="https://cgit.zeldakatze.de/smallUnix/">Hier</a>) gefunden werden. Zum Kompilieren gibt es ein eigenes Toolchain, das zunächst kompiliert werden muss. Anschließend kann das System recht leicht mit ./build.sh kompiliert werden </p> diff --git a/in/tinkering.html b/in/tinkering.html index 567e514..5bc38bb 100644 --- a/in/tinkering.html +++ b/in/tinkering.html @@ -1,8 +1,9 @@ +TITLE: Basteleien Ich habe derzeit einige Projekte an denen ich Arbeite, einige davon sind hier aufgezählt. -<a href='smallunix.html'><h2>smallUnix</h2></a> +<a href="smallunix.html"><h2>smallUnix</h2></a> Ein sich Unix-Ähnlich anfühlendes System, das nicht-kontinuierliche Adressräume verwendet. @@ -1,14 +1,61 @@ #!/bin/bash +run_ffmpeg_if_not_exist() { + local vid_input=$1 + local out_basename=$2 + local vid_codec=$3 + + ffmpeg -i "$vid_input" -c:v "$vid_codec" +} + +# generates multiple resolutions for the given video files.
+# $1: file name +# @return +encode_video() { + # name the input + local vid_input=$1 + local vid + + +} + +# ensure that the necessary directories exist mkdir -p out/ +mkdir -p out/video/mp4/ + +# copy robots.txt +echo "Copying robots.txt" +cp robots.txt out/ +# process every html site for inFile in in/*.html; do [ -e "$inFile" ] || continue echo "Processing file $inFile" filename=$(basename "$inFile") outFile="out/$filename" - cat header.html > "$outFile" - cat "$inFile" >> "$outFile" + + # get the title of the file if it is noted in the file + site_title="" + firstline=$(cat $inFile | head -n1) + [[ $firstline == TITLE:* ]] && { + site_title="$(echo $firstline | sed 's/TITLE: //g') - " + } + + # assemble the page + cat header.html | sed "s/%SITE_TITLE%/$site_title/g" > "$outFile" + if [[ $firstline == TITLE:* ]]; then + cat "$inFile" | tail -n +2 >> "$outFile" + else + echo -en "\t" && echo "$filename does not seem to have a title!" + cat "$inFile" >> "$outFile" + fi + cat footer.html >> "$outFile" done + +for inFile in videos/*; do + sleep 0 +done + + diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..eab12f6 --- /dev/null +++ b/robots.txt @@ -0,0 +1,67 @@ +# Block all known AI crawlers and assistants +# from using content for training AI models.
+# Source: https://robotstxt.com/ai +User-Agent: GPTBot +User-Agent: ClaudeBot +User-Agent: Claude-User +User-Agent: Claude-SearchBot +User-Agent: CCBot +User-Agent: Google-Extended +User-Agent: Applebot-Extended +User-Agent: Facebookbot +User-Agent: Meta-ExternalAgent +User-Agent: Meta-ExternalFetcher +User-Agent: diffbot +User-Agent: PerplexityBot +User-Agent: Perplexity-User +User-Agent: Omgili +User-Agent: Omgilibot +User-Agent: webzio-extended +User-Agent: ImagesiftBot +User-Agent: Bytespider +User-Agent: TikTokSpider +User-Agent: Amazonbot +User-Agent: Youbot +User-Agent: SemrushBot-OCOB +User-Agent: Petalbot +User-Agent: VelenPublicWebCrawler +User-Agent: TurnitinBot +User-Agent: Timpibot +User-Agent: OAI-SearchBot +User-Agent: ICC-Crawler +User-Agent: AI2Bot +User-Agent: AI2Bot-Dolma +User-Agent: DataForSeoBot +User-Agent: AwarioBot +User-Agent: AwarioSmartBot +User-Agent: AwarioRssBot +User-Agent: Google-CloudVertexBot +User-Agent: PanguBot +User-Agent: Kangaroo Bot +User-Agent: Sentibot +User-Agent: img2dataset +User-Agent: Meltwater +User-Agent: Seekr +User-Agent: peer39_crawler +User-Agent: cohere-ai +User-Agent: cohere-training-data-crawler +User-Agent: DuckAssistBot +User-Agent: Scrapy +User-Agent: Cotoyogi +User-Agent: aiHitBot +User-Agent: Factset_spyderbot +User-Agent: FirecrawlAgent + +Disallow: / +DisallowAITraining: / + +# Block any non-specified AI crawlers (e.g., new +# or unknown bots) from using content for training +# AI models, while allowing the website to be +# indexed and accessed by bots. These directives
# are still experimental and may not be supported +# by all AI crawlers. +User-Agent: * +DisallowAITraining: / +Content-Usage: ai=n +Allow: / |