# robots.txt for M2 InovaTec (www.m2inovatec.com)
#
# This file instructs search engine crawlers (Google, Bing, etc.) on which
# parts of the website they should and should not crawl and index.
#
# Best Practices (2025):
# - Keep it simple and well-commented
# - Use Allow directives to explicitly permit important paths
# - Use Disallow to block API endpoints, admin areas, and private content
# - Include sitemap location for efficient crawling
# - Test in Google Search Console regularly
#
# Last Updated: 2025-11-08
# Author: M2 InovaTec

# =============================================================================
# Global Rules - Apply to all search engine crawlers
# =============================================================================

User-agent: *

# Allow all public content by default
Allow: /

# Allow important public directories
# (wildcard patterns using * and $ are supported by Google, Bing, and other
# major crawlers)
Allow: /images/
Allow: /locales/
Allow: /*.css$
Allow: /*.js$

# Disallow API endpoints (not meant for search indexing)
Disallow: /api/

# Disallow JSON locale files from being indexed as separate pages
# (they are used by the application for translations, not standalone content)
Disallow: /locales/*.json$

# Disallow any potential temporary or cache directories
Disallow: /tmp/
Disallow: /cache/

# =============================================================================
# Sitemap Location
# =============================================================================

# XML Sitemap for efficient crawling and indexing
# Helps search engines discover all important pages quickly
Sitemap: https://www.m2inovatec.com/sitemap.xml

# =============================================================================
# Crawler-Specific Rules (Optional)
# =============================================================================
# Note: a crawler that matches one of these named groups follows ONLY that
# group and ignores the global (*) rules above, including Disallow: /api/.
# See the worked example at the end of this file.

# Googlebot - Google's web crawler
User-agent: Googlebot
Allow: /

# Googlebot-Image - Google's image crawler
# Allow crawling of all images
User-agent: Googlebot-Image
Allow: /images/

# Bingbot - Microsoft Bing's web crawler
User-agent: Bingbot
Allow: /

# =============================================================================
# AI Crawler Rules (2025 Best Practices)
# =============================================================================

# GPTBot - OpenAI's web crawler for training ChatGPT
User-agent: GPTBot
Allow: /

# Claude-Web - Anthropic's web crawler
User-agent: Claude-Web
Allow: /

# Google-Extended - a control token (not a separate crawler) that Googlebot
# checks to decide whether crawled content may be used to train Google's AI
# models (e.g. Gemini)
User-agent: Google-Extended
Allow: /

# CCBot - Common Crawl bot (its corpus is used by various AI companies)
User-agent: CCBot
Allow: /
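# =============================================================================
# AI Training Opt-Out (Example)
# =============================================================================
# The groups above deliberately allow AI crawlers. If that policy changes,
# uncommenting groups like the following is the standard opt-out pattern:
# Disallow: / blocks the entire site for that agent only. The bot names here
# mirror the live groups above; this is a sketch, not an active rule.

# User-agent: GPTBot
# Disallow: /

# User-agent: Google-Extended
# Disallow: /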
# =============================================================================
# Bad Bot Protection
# =============================================================================
# Block high-volume SEO and backlink-analysis crawlers that add server load
# without bringing visitors. All four declare robots.txt compliance, which is
# why listing them here is effective; truly abusive bots that ignore this file
# must be blocked at the server or firewall level instead.

User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: DotBot
Disallow: /

# =============================================================================
# Crawl Delay (Optional)
# =============================================================================
# Uncomment the following to request crawlers wait between requests
# This helps reduce server load from aggressive crawlers
# Note: Google ignores Crawl-delay entirely; Bing and Yandex honor it

# User-agent: *
# Crawl-delay: 1

# =============================================================================
# Additional Resources
# =============================================================================
# For LLM-specific documentation and structured data:
# - LLM Index: https://www.m2inovatec.com/llms.txt
# - LLM Full Docs: https://www.m2inovatec.com/llms-full.txt
# - AI Manifest: https://www.m2inovatec.com/.well-known/ai.txt
#
# For Schema.org structured data, see the JSON-LD in our HTML pages
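# =============================================================================
# Appendix: Group Matching (Worked Example)
# =============================================================================
# Per RFC 9309, a crawler obeys only the single most specific User-agent group
# that matches it. Because Googlebot has its own group above, it ignores the
# global (*) rules, including Disallow: /api/. To keep a global rule in force
# for such a bot, repeat it inside the bot's own group, as in this commented
# sketch (the bot name and path mirror the live rules above):

# User-agent: Googlebot
# Allow: /
# Disallow: /api/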