Add missing scripts dir.
All checks were successful
Build site / build (push) Successful in 12s

This commit is contained in:
2026-03-11 00:12:47 +05:30
parent 525614fb3d
commit 1b2e212133
3 changed files with 517 additions and 0 deletions

109
scripts/generate_site.py Normal file
View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""Generate Zola content pages from space data in entries.yml.
Usage: python generate_site.py <entries.yml> <content_dir>
"""
import csv
import os
import re
import sys
import yaml
def slugify(text):
    """Lowercase *text* and collapse every run of non-alphanumerics to a hyphen."""
    lowered = text.lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return hyphenated.strip("-")
def to_toml_array(items):
    """Render *items* as a TOML array of basic strings.

    Backslashes and double quotes are escaped (the original quoted values
    verbatim, producing invalid TOML for any item containing '"' or '\\').
    """
    escaped = [str(i).replace("\\", "\\\\").replace('"', '\\"') for i in items]
    return "[" + ", ".join(f'"{e}"' for e in escaped) + "]"
def _toml_escape(value):
    """Escape backslashes and double quotes so *value* is safe inside a
    double-quoted TOML string."""
    return str(value).replace("\\", "\\\\").replace('"', '\\"')


def main():
    """Read entries.yml and emit Zola content pages plus a CSV export.

    CLI: python generate_site.py <entries.yml> <content_dir>

    Writes:
      * a section _index.md one level above content_dir,
      * a transparent _index.md inside content_dir,
      * one <slug>.md page per entry (city-suffixed when names collide),
      * static/spaces.csv next to the content tree.
    """
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <entries.yml> <content_dir>", file=sys.stderr)
        sys.exit(1)
    entries_path = sys.argv[1]
    content_dir = sys.argv[2]
    with open(entries_path) as f:
        entries = yaml.safe_load(f)
    # Derive sibling paths from content_dir (e.g. content/spaces -> content, static)
    site_root = os.path.dirname(os.path.dirname(content_dir))
    os.makedirs(content_dir, exist_ok=True)
    # Main section index
    parent_content_dir = os.path.dirname(content_dir)
    with open(os.path.join(parent_content_dir, "_index.md"), "w") as f:
        f.write("+++\ntitle = \"Community & Public Spaces in India\"\nsort_by = \"title\"\n+++\n")
    # Spaces section index (transparent so entries appear under root)
    with open(os.path.join(content_dir, "_index.md"), "w") as f:
        f.write("+++\ntitle = \"Spaces\"\nsort_by = \"title\"\ntransparent = true\n+++\n")
    # Detect duplicate name slugs and disambiguate with city
    from collections import Counter
    slug_counts = Counter(slugify(e["name"]) for e in entries)
    for entry in entries:
        base_slug = slugify(entry["name"])
        if slug_counts[base_slug] > 1:
            slug = f"{base_slug}-{slugify(entry['city'])}"
        else:
            slug = base_slug
        coords = entry.get("coords", [0, 0])
        # Escape interpolated values BEFORE building the f-string. The original
        # escaped only title/description quotes (state/city/address/url were
        # interpolated raw -- invalid TOML if they ever contain '"' or '\')
        # and placed backslashes inside f-string expressions, which is a
        # SyntaxError on Python < 3.12.
        frontmatter = f"""+++
title = "{_toml_escape(entry['name'])}"
description = "{_toml_escape(entry.get('description', ''))}, {_toml_escape(entry['city'])}"
[taxonomies]
states = ["{_toml_escape(entry['state'])}"]
cities = ["{_toml_escape(entry['city'])}"]
categories = {to_toml_array(entry.get('categories', []))}
[extra]
address = "{_toml_escape(entry.get('address', ''))}"
pincode = "{entry.get('pincode', '')}"
lat = {coords[0]}
lng = {coords[1]}
url = "{_toml_escape(entry.get('url', ''))}"
tags = {to_toml_array(entry.get('tags', []))}
+++
"""
        body = entry.get('description', '')
        filepath = os.path.join(content_dir, f"{slug}.md")
        with open(filepath, "w") as f:
            f.write(frontmatter)
            f.write(f"\n{body}\n")
    # Generate CSV for download
    static_dir = os.path.join(site_root, "static")
    os.makedirs(static_dir, exist_ok=True)
    csv_path = os.path.join(static_dir, "spaces.csv")
    with open(csv_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "city", "state", "address", "pincode", "lat", "lng", "url", "description", "categories", "tags"])
        for entry in entries:
            coords = entry.get("coords", [0, 0])
            writer.writerow([
                entry["name"],
                entry["city"],
                entry["state"],
                entry.get("address", ""),
                entry.get("pincode", ""),
                coords[0],
                coords[1],
                entry.get("url", ""),
                entry.get("description", ""),
                ", ".join(entry.get("categories", [])),
                ", ".join(entry.get("tags", [])),
            ])
    print(f"generated {len(entries)} content pages + {csv_path}")


if __name__ == "__main__":
    main()

94
scripts/geocode.py Normal file
View File

@@ -0,0 +1,94 @@
#!/usr/bin/env python3
import json
import re
import time
import urllib.request
import urllib.parse
import yaml
# Path to the file holding the Google geocoding API key (single line).
API_KEY_FILE = ".google-key"
# Input: the YAML directory of spaces, relative to this scripts/ dir.
ENTRIES_FILE = "../data/entries.yml"
# Output: raw geocoding API responses, one record per entry, index-aligned.
OUTPUT_FILE = "../data/geocode_raw.json"
# Keywords (in address or city) marking an entry that spans many locations.
MULTI_KEYWORDS = ["multiple", "pan-india", "across india"]


def is_multi_location(entry):
    """Return True when the entry's address or city marks it as multi-location."""
    haystacks = (entry.get("address", "").lower(), entry.get("city", "").lower())
    return any(keyword in hay for keyword in MULTI_KEYWORDS for hay in haystacks)
def build_query(entry):
    """Build the free-text query string sent to the geocoding API.

    Multi-location entries geocode their primary city only (falling back to
    the address when city is "Various"); normal entries use the address with
    any embedded 6-digit PIN code stripped, plus the city. The original also
    bound an unused local ``name`` -- removed.
    """
    city = entry.get("city", "")
    address = entry.get("address", "")
    if is_multi_location(entry):
        # For multilocation entries, geocode the primary city.
        # Extract a real city name if city is "Various"
        if city.lower() == "various":
            # Try to extract a city from the address
            return address + ", India"
        return city + ", India"
    # Strip existing pincode from address for cleaner query
    addr_clean = re.sub(r'\b\d{6}\b', '', address).strip().rstrip(',').strip()
    if addr_clean:
        return addr_clean + ", " + city + ", India"
    else:
        return city + ", India"
def geocode(query, api_key):
    """Call the Google Geocoding API for *query* and return the parsed JSON body."""
    params = urllib.parse.urlencode({"address": query, "key": api_key})
    url = "https://maps.googleapis.com/maps/api/geocode/json?" + params
    with urllib.request.urlopen(url) as resp:
        payload = resp.read().decode()
    return json.loads(payload)
def main():
    """Geocode every entry in ENTRIES_FILE and dump raw responses to OUTPUT_FILE."""
    with open(API_KEY_FILE) as f:
        api_key = f.read().strip()
    with open(ENTRIES_FILE) as f:
        entries = yaml.safe_load(f)
    print(f"Loaded {len(entries)} entries")
    results = []
    for i, entry in enumerate(entries):
        query = build_query(entry)
        # Fixed-width (40-char) name column keeps the progress log aligned.
        print(f"[{i+1:3d}/{len(entries)}] {entry['name'][:40]:40s}{query[:60]}")
        try:
            resp = geocode(query, api_key)
            status = resp.get("status", "UNKNOWN")
            if status != "OK":
                print(f"not ok={status}")
        except Exception as e:
            # Record the failure and keep going; errors stay visible in the dump.
            print(f"Error: {e}")
            resp = {"status": "ERROR", "error": str(e)}
        # results stays index-aligned with entries (consumed by merge_addresses.py).
        results.append({
            "index": i,
            "name": entry["name"],
            "query": query,
            "response": resp,
        })
        # Throttle requests between API calls.
        time.sleep(0.1)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    ok = sum(1 for r in results if r["response"].get("status") == "OK")
    print(f"\nDone. {ok}/{len(results)}. Saved to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

314
scripts/merge_addresses.py Normal file
View File

@@ -0,0 +1,314 @@
#!/usr/bin/env python3
import json
import re
import yaml
# Input entries and raw geocode responses. NOTE: the output overwrites the
# entries file in place (ENTRIES_FILE == OUTPUT_FILE).
ENTRIES_FILE = "../data/entries.yml"
GEOCODE_FILE = "../data/geocode_raw.json"
OUTPUT_FILE = "../data/entries.yml"
# Keywords (in address or city) marking an entry that spans many locations.
MULTI_KEYWORDS = ["multiple", "pan-india", "across india"]
# Header comment to preserve
HEADER = "# Community/Public spaces directory - India.\n# https://ooru.space\n"
def extract_components(result):
    """Flatten the first geocode result into a {component_type: long_name} dict.

    Also attaches "lat"/"lng" from the result geometry. Returns {} when the
    API response carried no results.
    """
    hits = result.get("response", {}).get("results")
    if not hits:
        return {}
    first = hits[0]
    flat = {}
    for component in first.get("address_components", []):
        long_name = component["long_name"]
        for component_type in component["types"]:
            flat[component_type] = long_name
    location = first.get("geometry", {}).get("location", {})
    flat["lat"] = location.get("lat")
    flat["lng"] = location.get("lng")
    return flat
def extract_pincode(entry, comps):
    """Extract pincode: prefer original embedded pincode, fall back to API postal_code."""
    # The user-provided address is usually more specific than the API result.
    embedded = re.search(r'\b(\d{6})\b', entry.get("address", ""))
    if embedded:
        return embedded.group(1)
    # Fall back to the API component, accepted only if it is exactly 6 digits.
    api_pin = comps.get("postal_code")
    if api_pin and re.match(r'^\d{6}$', str(api_pin)):
        return str(api_pin)
    return None
def strip_pincode(address):
    """Remove a 6-digit pincode from *address* and tidy trailing comma/whitespace."""
    without_pin = re.sub(r'\s*\b\d{6}\b', '', address)
    return without_pin.strip().rstrip(',').strip()
def normalize(s):
    """Collapse internal whitespace, trim, and lowercase for fuzzy comparison."""
    collapsed = re.sub(r'\s+', ' ', s)
    return collapsed.strip().lower()
# Suffixes marking an API sublocality as building-level noise rather than a
# real neighbourhood. Compiled once at import time; (?i) keeps each pattern
# case-insensitive. The original pattern 'apparte?ments?$' required a double
# "p" and therefore never matched the standard spelling "Apartment(s)";
# 'ap?parte?ments?$' accepts one or two p's.
_NOISE_RES = [re.compile(p) for p in (
    r'(?i)ap?parte?ments?$', r'(?i)towers?$', r'(?i)heights?$',
    r'(?i)residenc[ey]$', r'(?i)complex$', r'(?i)plaza$',
    r'(?i)society$', r'(?i)enclave$',
)]


def is_noisy_sublocality(s):
    """Detect noisy API sublocalities (apartment complexes, colonies named after buildings)."""
    text = s.strip()
    return any(rx.search(text) for rx in _NOISE_RES)
def dedupe_parts(parts):
    """Remove duplicate and near-duplicate address parts.

    Comparison is case-insensitive and substring-aware: a part already covered
    by a longer kept part is dropped, and a kept part covered by a longer new
    part is upgraded in place. Empty fragments are skipped.
    """
    kept = []
    for raw in parts:
        candidate = raw.strip().rstrip(',').strip()
        if not candidate:
            continue
        cand_norm = normalize(candidate)
        duplicate = False
        upgrade_at = None
        for idx, existing in enumerate(kept):
            existing_norm = normalize(existing)
            if cand_norm == existing_norm:
                duplicate = True
                break
            # A longer kept part already covers this one ("New Delhi" covers "Delhi").
            if len(cand_norm) > 3 and cand_norm in existing_norm:
                duplicate = True
                break
            # This part covers a shorter kept one: replace it in place.
            if len(existing_norm) > 3 and existing_norm in cand_norm:
                upgrade_at = idx
                break
        if upgrade_at is not None:
            kept[upgrade_at] = candidate
        elif not duplicate:
            kept.append(candidate)
    return kept
def build_improved_address(entry, comps):
    """Build improved address by merging original details with API sublocalities.
    Returns the address string with state removed (state goes to its own field).
    """
    original = entry.get("address", "")
    city = entry.get("city", "")
    # Strip pincode from original
    original_clean = strip_pincode(original)
    # Parse original into parts
    orig_parts = [p.strip() for p in original_clean.split(",") if p.strip()]
    # API-provided sublocalities and locality, most granular first
    api_parts = []
    for key in ["sublocality_level_2", "sublocality_level_1", "locality"]:
        val = comps.get(key)
        if val:
            api_parts.append(val)
    # State from API
    state = comps.get("administrative_area_level_1", "")
    # Collect all known parts from original address (normalized for dedup)
    orig_normalized = {normalize(p) for p in orig_parts}
    # Add API sublocalities that aren't already in the original (skip noisy ones)
    # Also skip if a fuzzy match exists (e.g., "Khirki Extension" vs "Khirkee Extension")
    enrichment = []
    for ap in api_parts:
        apn = normalize(ap)
        if apn == normalize(city):
            continue
        if is_noisy_sublocality(ap):
            continue
        # Check exact match
        if apn in orig_normalized:
            continue
        # Check fuzzy substring containment (either direction)
        fuzzy_match = False
        for op in orig_normalized:
            # If >60% of characters overlap, skip (handles spelling variants)
            # NOTE(review): the code actually counts shared whitespace-split
            # words (any overlap triggers a match), not a 60% character
            # overlap -- the comment above overstates the check.
            if len(apn) > 4 and len(op) > 4:
                common = len(set(apn.split()) & set(op.split()))
                if common > 0:
                    fuzzy_match = True
                    break
            # Substring containment
            if apn in op or op in apn:
                fuzzy_match = True
                break
        if not fuzzy_match:
            enrichment.append(ap)
    # Limit API enrichment: if original already has 3+ parts, add at most 1 sublocality
    if len(orig_parts) >= 3 and len(enrichment) > 1:
        enrichment = enrichment[:1]
    # Build final address parts:
    # Start with original (granular details first), then add missing sublocalities
    all_parts = list(orig_parts)
    # Insert API sublocalities after the most granular parts but before city/state
    # Find where city/state appear in original to insert before them
    insert_pos = len(all_parts)
    for i, p in enumerate(all_parts):
        pn = normalize(p)
        # NOTE(review): `pn == normalize(state)` and the trailing
        # `(state and pn == normalize(state))` are redundant duplicates of
        # each other; behavior is unchanged but one clause could go.
        if pn == normalize(city) or pn == normalize(state) or (state and pn == normalize(state)):
            insert_pos = i
            break
        # Also check for district-level matches
        district = comps.get("administrative_area_level_2", "")
        if district and pn == normalize(district):
            insert_pos = i
            break
    for j, ep in enumerate(enrichment):
        all_parts.insert(insert_pos + j, ep)
    # Deduplicate
    final = dedupe_parts(all_parts)
    # Remove "India" and state from address (state goes to its own field)
    final = [p for p in final if normalize(p) != "india"]
    if state:
        final = [p for p in final if normalize(p) != normalize(state)]
    return ", ".join(final)
def yaml_quote(s):
    """Render *s* as a double-quoted YAML scalar; None becomes an empty string."""
    if s is None:
        return '""'
    # Backslashes first, then double quotes, so escapes aren't double-escaped.
    escaped = str(s).replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '"'
def write_yaml(entries, filepath):
    """Write entries to YAML with exact formatting control."""
    # NOTE(review): the leading spaces inside the literals below look collapsed
    # (keys under a "- " list item need a 2-space indent to be valid YAML) --
    # verify against the repository copy; bytes reproduced as seen here.
    lines = [HEADER]
    for entry in entries:
        # Required fields first; optional ones only when present/truthy.
        lines.append(f'- name: {yaml_quote(entry["name"])}')
        lines.append(f' city: {yaml_quote(entry["city"])}')
        if entry.get("state"):
            lines.append(f' state: {yaml_quote(entry["state"])}')
        lines.append(f' address: {yaml_quote(entry["address"])}')
        if entry.get("pincode"):
            lines.append(f' pincode: {yaml_quote(entry["pincode"])}')
        if entry.get("coords"):
            lat, lng = entry["coords"]
            lines.append(f' coords: [{lat}, {lng}]')
        lines.append(f' url: {yaml_quote(entry["url"])}')
        lines.append(f' description: {yaml_quote(entry["description"])}')
        lines.append(' categories:')
        for cat in entry.get("categories", []):
            lines.append(f' - {cat}')
        # tags distinguishes "absent" (omit key) from "present but empty" ([]).
        if "tags" in entry:
            if entry["tags"]:
                lines.append(' tags:')
                for tag in entry["tags"]:
                    lines.append(f' - {tag}')
            else:
                lines.append(' tags: []')
        lines.append("")  # blank line between entries
    with open(filepath, "w") as f:
        f.write("\n".join(lines))
def main():
    """Merge geocode results into the entries file, rewriting it in place."""
    with open(ENTRIES_FILE) as f:
        entries = yaml.safe_load(f)
    with open(GEOCODE_FILE) as f:
        geocode_results = json.load(f)
    print(f"Loaded {len(entries)} entries, {len(geocode_results)} geocode results")
    stats = {"improved": 0, "pin_api": 0, "pin_regex": 0, "no_pin": 0, "no_result": 0}
    for i, entry in enumerate(entries):
        # geocode_results is index-aligned with entries (written by geocode.py).
        geo = geocode_results[i]
        comps = extract_components(geo)
        if not comps:
            print(f"no geocode result for: {entry['name']}")
            stats["no_result"] += 1
            # Still strip pincode from address if present
            pin_match = re.search(r'\b(\d{6})\b', entry.get("address", ""))
            if pin_match:
                entry["pincode"] = pin_match.group(1)
                entry["address"] = strip_pincode(entry["address"])
            continue
        # Check if it's a multi-location entry
        addr_lower = entry.get("address", "").lower()
        city_lower = entry.get("city", "").lower()
        is_multi = any(k in addr_lower or k in city_lower for k in MULTI_KEYWORDS)
        # Extract pincode (skip for multi-location with "Various" city)
        if is_multi and city_lower == "various":
            stats["no_pin"] += 1
        else:
            pincode = extract_pincode(entry, comps)
            if pincode:
                entry["pincode"] = pincode
                # NOTE(review): attribution checks only whether the API had a
                # valid postal_code; extract_pincode prefers the address regex,
                # so a pin found in the address may still be counted as pin_api.
                if comps.get("postal_code") and re.match(r'^\d{6}$', str(comps["postal_code"])):
                    stats["pin_api"] += 1
                else:
                    stats["pin_regex"] += 1
            else:
                stats["no_pin"] += 1
        # Extract state from API
        api_state = comps.get("administrative_area_level_1", "")
        if api_state:
            entry["state"] = api_state
        if not is_multi:
            entry["address"] = build_improved_address(entry, comps)
        else:
            # For multi-location entries, still strip state from address if present
            if api_state:
                parts = [p.strip() for p in entry["address"].split(",")]
                parts = [p for p in parts if normalize(p) != normalize(api_state)]
                entry["address"] = ", ".join(parts)
        # Set coords (rounded to 4 decimals)
        if comps.get("lat") is not None and comps.get("lng") is not None:
            entry["coords"] = [round(comps["lat"], 4), round(comps["lng"], 4)]
        stats["improved"] += 1
    print(f"\nStats:")
    print(f" improved: {stats['improved']}")
    print(f" PIN from API: {stats['pin_api']}")
    print(f" PIN from regex: {stats['pin_regex']}")
    print(f" No PIN found: {stats['no_pin']}")
    print(f" No geocode result: {stats['no_result']}")
    write_yaml(entries, OUTPUT_FILE)
    print(f"\nWritten to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()