<?php
// crawler_live_moalmy_final.php

// Runtime setup: the crawl runs until the queue is empty, so the execution
// time limit is removed and every PHP error is reported and displayed.
set_time_limit(0);
ini_set('display_errors', 1);
error_reporting(E_ALL);

// Send output to the client as soon as it is echoed so progress appears live.
// ob_end_flush() closes the active output buffer; the @ hides the notice that
// is raised when no buffer is active.
ob_implicit_flush(true);
@ob_end_flush();

// First page placed in the queue when a crawl starts from scratch
// (the path is percent-encoded).
$startUrl = "https://moalmy.com/%D8%A5%D9%84%D8%B9%D8%A8-%D9%88%D8%A7%D8%AA%D8%B9%D9%84%D9%85-%D8%A7%D9%84%D9%81%D8%B5%D9%84%D9%8A%D9%86-%D8%A7%D9%84%D8%A7%D9%88%D9%84-%D9%88%D8%A7%D9%84%D8%AB%D8%A7%D9%86%D9%8A/";

// Files written next to this script:
//   $dataFile       - crawl result as a JSON list (written by save_state)
//   $htmlFile       - HTML rendering of the result (written by save_state)
//   $queueFile      - pending queue, read back on start-up to resume a crawl
//   $dataBackupFile - URL-keyed result map, read back on start-up to resume a crawl
//   $logFile        - plain-text log appended to by log_msg
//   $failedFile     - JSON list of URLs that could not be fetched (save_failed)
$dataFile       = __DIR__ . "/sitemap_data.json";
$htmlFile       = __DIR__ . "/sitemap.html";
$queueFile      = __DIR__ . "/queue_backup.json";
$dataBackupFile = __DIR__ . "/data_backup.json";
$logFile        = __DIR__ . "/crawler_log.txt";
$failedFile     = __DIR__ . "/failed_urls.json";

// Pause, in seconds, passed to sleep() after each processed page.
$delay = 1;

/**
 * Append one timestamped line to the crawler log file.
 *
 * The line has the form "Y-m-d H:i:s - <message>" followed by PHP_EOL and is
 * written to the path held in the global $logFile.
 *
 * @param string $msg Message text to record.
 */
function log_msg($msg){
    global $logFile;

    $stamp = date("Y-m-d H:i:s");
    $line  = "{$stamp} - {$msg}" . PHP_EOL;

    file_put_contents($logFile, $line, FILE_APPEND);
}

/**
 * Reduce an HTML fragment to a single line of plain text.
 *
 * Tags are stripped first, HTML entities are then decoded as UTF-8, every run
 * of whitespace is collapsed to one space and the result is trimmed.
 *
 * @param string $text Raw text or markup.
 * @return string Cleaned text.
 */
function clean_text($text){
    $withoutTags = strip_tags($text);
    $decoded     = html_entity_decode($withoutTags, ENT_QUOTES | ENT_HTML5, "UTF-8");
    $collapsed   = preg_replace('/\s+/u', ' ', $decoded);

    return trim($collapsed);
}

/**
 * Resolve a link found on a page into an absolute http(s) URL without fragment.
 *
 * Resolution follows the usual relative-reference rules:
 *   - absolute http(s) URLs are returned with the fragment removed;
 *   - protocol-relative references ("//host/path") inherit the base scheme;
 *   - root-relative references ("/path") are attached to the base origin;
 *   - query-only references ("?a=b") keep the base path;
 *   - other relative references are merged with the base path up to its last
 *     slash, and "." / ".." segments are collapsed.
 *
 * @param string $baseUrl URL of the page the link was found on. When it lacks a
 *                        scheme or host, "https" and "moalmy.com" are assumed.
 * @param string $href    Raw href attribute value.
 * @return string Absolute URL, or '' when the href is empty, is a pure fragment,
 *                or uses a scheme other than http/https (javascript:, mailto:,
 *                tel:, data:, ...).
 */
function normalize_url($baseUrl, $href){
    $href = trim((string) $href);

    if($href === '' || $href[0] === '#') return '';

    // The fragment never reaches the server; drop it before anything else.
    // The href does not start with '#', so something always remains.
    $hashPos = strpos($href, '#');
    if($hashPos !== false){
        $href = substr($href, 0, $hashPos);
    }

    if(preg_match('/^https?:\/\//i', $href)){
        return $href;
    }

    // Any other explicit scheme cannot be crawled over HTTP.
    if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) return '';

    $p = parse_url((string) $baseUrl);
    if(!is_array($p)) $p = [];

    $scheme = $p['scheme'] ?? 'https';
    $host   = $p['host'] ?? 'moalmy.com';
    $port   = isset($p['port']) ? ':' . $p['port'] : '';

    // Protocol-relative reference: it carries its own host.
    if(strpos($href, '//') === 0){
        return $scheme . ':' . $href;
    }

    $basePath = (isset($p['path']) && $p['path'] !== '') ? $p['path'] : '/';

    // Keep the query aside so that dot-segment handling only sees the path.
    $query    = '';
    $queryPos = strpos($href, '?');
    if($queryPos !== false){
        $query = substr($href, $queryPos);
        $href  = substr($href, 0, $queryPos);
    }

    if($href === ''){
        // Query-only reference: same document path, new query string.
        $path = $basePath;
    } elseif($href[0] === '/'){
        $path = $href;
    } else {
        // Merge with the base "directory": everything up to the last slash.
        // For a base of /a/b/ that is /a/b/, for /a/b it is /a/.
        $lastSlash = strrpos($basePath, '/');
        $baseDir   = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);
        $path      = $baseDir . $href;
    }

    if($path[0] !== '/') $path = '/' . $path;

    // Collapse "." and ".." segments; index 0 is the empty root segment and is
    // never popped, so ".." cannot climb above the site root.
    $segments = explode('/', $path);
    $lastIdx  = count($segments) - 1;
    $resolved = [];

    foreach($segments as $idx => $segment){
        if($segment === '.' || $segment === '..'){
            if($segment === '..' && count($resolved) > 1){
                array_pop($resolved);
            }
            // A trailing dot segment names a directory: keep the final slash.
            if($idx === $lastIdx) $resolved[] = '';
            continue;
        }
        $resolved[] = $segment;
    }

    return $scheme . '://' . $host . $port . implode('/', $resolved) . $query;
}

/**
 * Tell whether a URL points at the crawled site.
 *
 * Hostnames are case-insensitive and may be written with a trailing dot, so
 * the host is lower-cased and stripped of that dot before it is compared.
 *
 * @param string $url Absolute URL to test.
 * @return bool True when the host is moalmy.com or www.moalmy.com; false for
 *              any other host and for URLs without a parsable host.
 */
function same_domain($url){
    $host = parse_url((string) $url, PHP_URL_HOST);

    // parse_url() yields null (no host) or false (malformed URL) here.
    if(!is_string($host)) return false;

    $host = rtrim(strtolower($host), '.');

    return $host === "moalmy.com" || $host === "www.moalmy.com";
}

/**
 * Download a page with cURL using a browser-like request.
 *
 * Redirects are followed, but only over http/https and at most 10 times, so a
 * redirect loop or a redirect to another protocol cannot stall or misdirect
 * the crawl.
 *
 * @param string $url Absolute URL to fetch.
 * @return string|false Response body, or false on a transport error, an HTTP
 *                      status >= 400 or an empty body.
 */
function get_html($url){
    $ch = curl_init($url);

    if($ch === false){
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 10,
        CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        // NOTE(review): certificate and host verification are switched off, so
        // the response can be tampered with in transit. Left unchanged because
        // enabling it may break hosts without a CA bundle - confirm and enable.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36",
        CURLOPT_HTTPHEADER => [
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language: ar,en-US;q=0.9,en;q=0.8",
            "Cache-Control: no-cache"
        ],
        CURLOPT_TIMEOUT => 35,
        CURLOPT_CONNECTTIMEOUT => 15
    ]);

    $html = curl_exec($ch);
    $err  = curl_error($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // Handles are released automatically from PHP 8.0 on, where curl_close()
    // does nothing; only older runtimes need the explicit call.
    if(PHP_VERSION_ID < 80000){
        curl_close($ch);
    }

    if($html === false || $html === '' || $err !== '' || $code >= 400){
        return false;
    }

    return $html;
}

/**
 * Decide whether an element, or any of its ancestor elements, is marked hidden.
 *
 * An element counts as hidden when its inline style contains display:none,
 * visibility:hidden or a zero opacity, or when its class list contains one of
 * the hiding / ad-container classes listed in the pattern below.
 *
 * @param DOMNode|null $node Element to start from; the walk goes up through
 *                           parentNode until a non-element node is reached.
 * @return bool True when the node or one of its ancestors is hidden.
 */
function is_hidden_node($node){
    while($node && $node->nodeType === XML_ELEMENT_NODE){
        // Pad with spaces so every class name is surrounded by whitespace.
        $class = strtolower(' ' . $node->getAttribute("class") . ' ');
        // Whitespace is removed so "display : none" and "display:none" match alike.
        $style = strtolower(str_replace([' ', "\n", "\r", "\t"], '', $node->getAttribute("style")));

        if(
            strpos($style, "display:none") !== false ||
            strpos($style, "visibility:hidden") !== false ||
            // Only a value of exactly zero (0, 0.0, 0%) hides the element; a
            // plain substring test would also catch "opacity:0.8".
            preg_match('/opacity:0+(?:\.0*)?%?(?:;|!|$)/', $style) ||
            preg_match('/\s(hidden|hide|d-none|invisible|adsbygoogle|google-auto-placed|code-block)\s/', $class)
        ){
            return true;
        }

        $node = $node->parentNode;
    }

    return false;
}

/**
 * Detect a leaf (article) page.
 *
 * A page is final when its markup contains a <div> whose class list includes
 * "blog-detail-header".
 *
 * @param string $html Raw page markup.
 * @return bool True when such a div is present.
 */
function is_final_page($html){
    // Collect parser complaints internally instead of emitting warnings, then
    // discard them and restore the caller's setting. Without the clean-up the
    // error buffer would keep growing for every page of a long crawl.
    $previousSetting = libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    // The XML declaration prefix makes libxml read the markup as UTF-8.
    @$dom->loadHTML('<?xml encoding="UTF-8">' . $html);

    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXPath($dom);

    $nodes = $xpath->query(
        '//div[contains(concat(" ",normalize-space(@class)," ")," blog-detail-header ")]'
    );

    return $nodes->length > 0;
}

/**
 * Collect the in-site links of a page from the first content area that has any.
 *
 * The XPath queries below are tried in order, from the most specific content
 * container to the most generic one; the first query that yields at least one
 * usable link wins and the remaining queries are skipped.
 *
 * A link is usable when it is not hidden, resolves to a URL on the crawled
 * domain, has a non-empty title and was not already seen on this page. The
 * title comes from a nested span.title, else the anchor text, else the
 * anchor's title attribute.
 *
 * @param string $html       Raw page markup.
 * @param string $currentUrl URL of the page, used to resolve relative hrefs.
 * @return array List of ["url" => string, "title" => string] entries.
 */
function extract_links($html, $currentUrl){
    // Buffer parser errors, then drop them and restore the previous setting so
    // the buffer does not grow with every crawled page.
    $previousSetting = libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    // The XML declaration prefix makes libxml read the markup as UTF-8.
    @$dom->loadHTML('<?xml encoding="UTF-8">' . $html);

    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXPath($dom);

    $queries = [
        '//article[contains(concat(" ",normalize-space(@class)," ")," page ")]//div[contains(concat(" ",normalize-space(@class)," ")," entry-content ")]//a[@href]',
        '//div[contains(concat(" ",normalize-space(@class)," ")," entry-content ")]//a[@href]',
        '//div[contains(concat(" ",normalize-space(@class)," ")," content ") or contains(concat(" ",normalize-space(@class)," ")," content1 ")]//a[@href]',
        '//a[contains(concat(" ",normalize-space(@class)," ")," btn ") and @href]',
        '//div[contains(concat(" ",normalize-space(@class)," ")," colum16 ")]//h4/a[@href]',
        '//div[contains(concat(" ",normalize-space(@class)," ")," blog-detail-text ")]//a[@href]'
    ];

    $links = [];
    $seen  = [];

    foreach($queries as $query){
        $nodes = $xpath->query($query);

        foreach($nodes as $a){
            if(is_hidden_node($a)) continue;

            $href = normalize_url($currentUrl, $a->getAttribute("href"));

            // normalize_url() signals "not crawlable" with an empty value.
            if(!is_string($href) || $href === '') continue;
            if(!same_domain($href)) continue;

            $titleNode = $xpath->query('.//span[contains(concat(" ",normalize-space(@class)," ")," title ")]', $a);

            if($titleNode->length){
                $title = clean_text($titleNode->item(0)->textContent);
            } else {
                $title = clean_text($a->textContent);
            }

            // Compare against '' explicitly: a loose check would also throw
            // away a legitimate title such as "0".
            if($title === ''){
                $title = clean_text($a->getAttribute("title"));
            }

            if($title === '') continue;

            if(isset($seen[$href])) continue;

            $seen[$href] = true;

            $links[] = [
                "url" => $href,
                "title" => $title
            ];
        }

        // Stop at the first query that produced links.
        if(!empty($links)){
            break;
        }
    }

    return $links;
}

/**
 * Check whether a URL is already waiting in the queue.
 *
 * Entries without a "url" key are treated as having the URL "".
 *
 * @param array  $queue List of queue entries.
 * @param string $url   URL to look for (strict comparison).
 * @return bool True when some entry carries exactly this URL.
 */
function queue_has($queue, $url){
    $found = false;

    foreach($queue as $entry){
        $entryUrl = isset($entry["url"]) ? $entry["url"] : "";

        if($entryUrl === $url){
            $found = true;
            break;
        }
    }

    return $found;
}

/**
 * Check whether a URL already has a (non-null) record in the result map.
 *
 * @param array  $data URL-keyed map of crawl records.
 * @param string $url  URL to look up.
 * @return bool True when $data holds a non-null entry under this URL.
 */
function data_has($data, $url){
    $record = $data[$url] ?? null;

    return $record !== null;
}

/**
 * Record a URL that could not be fetched.
 *
 * The failure list stored in the global $failedFile is loaded (an absent or
 * unreadable list counts as empty), one entry with the URL, its title and the
 * current time is appended, and the whole list is written back as pretty JSON.
 *
 * @param string $url   URL that failed.
 * @param string $title Title the URL was queued under.
 */
function save_failed($url, $title){
    global $failedFile;

    $entries = [];

    if(file_exists($failedFile)){
        $decoded = json_decode(file_get_contents($failedFile), true);

        if(is_array($decoded)){
            $entries = $decoded;
        }
    }

    $entries[] = [
        "url" => $url,
        "title" => $title,
        "time" => date("Y-m-d H:i:s")
    ];

    $json = json_encode($entries, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);

    file_put_contents($failedFile, $json);
}

/**
 * Render a list of crawl records as a nested <ul> tree.
 *
 * Each record becomes an <li> holding its HTML-escaped title (a placeholder
 * when the title is missing); a non-empty "children" list is rendered
 * recursively inside the same <li>.
 *
 * @param array $data List of records with optional "title" and "children".
 * @return string HTML fragment starting with <ul> and ending with </ul>.
 */
function build_tree_html($data){
    $items = [];

    foreach($data as $entry){
        $label   = htmlspecialchars($entry["title"] ?? "بدون عنوان");
        $subtree = empty($entry["children"]) ? "" : build_tree_html($entry["children"]);

        $items[] = "<li><span class='node'>" . $label . "</span>" . $subtree . "</li>";
    }

    return "<ul>" . implode("", $items) . "</ul>";
}

/**
 * Persist the crawl state and the current site map.
 *
 * Four files are written: the pending queue and the URL-keyed data map (both
 * read back on start-up to resume), the data as a plain JSON list, and an HTML
 * rendering of the tree.
 *
 * Every file is written to a temporary sibling and then renamed over the
 * target, so an interrupted run leaves the previous version intact instead of
 * a truncated file. A payload that cannot be JSON-encoded is skipped with a
 * warning rather than replacing the existing file with empty content.
 *
 * @param array $queue Entries still waiting to be crawled.
 * @param array $data  URL-keyed map of crawled records.
 */
function save_state($queue, $data){
    global $queueFile, $dataBackupFile, $dataFile, $htmlFile;

    $jsonFlags = JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT;

    $writeAtomically = function($path, $contents){
        if($contents === false){
            // json_encode() failed (e.g. invalid UTF-8): keep the old file.
            trigger_error("save_state: could not encode contents for " . $path, E_USER_WARNING);
            return false;
        }

        $tmpPath = $path . ".tmp";

        if(file_put_contents($tmpPath, $contents, LOCK_EX) === false){
            return false;
        }

        if(!rename($tmpPath, $path)){
            unlink($tmpPath);
            return false;
        }

        return true;
    };

    $records = array_values($data);

    $writeAtomically($queueFile, json_encode(array_values($queue), $jsonFlags));
    $writeAtomically($dataBackupFile, json_encode($data, $jsonFlags));
    $writeAtomically($dataFile, json_encode($records, $jsonFlags));

    // The page is opened on its own in a browser, so it declares its encoding;
    // otherwise the Arabic text depends on charset sniffing.
    $writeAtomically(
        $htmlFile,
        "<meta charset='UTF-8'>" . "<h2>خريطة الموقع حتى الآن</h2>" . build_tree_html($records)
    );
}

// Starting state: begin from the start page unless a usable backup exists.
$seedQueue = [["url" => $startUrl, "title" => "البداية", "depth" => 0]];

$queue = $seedQueue;
$data  = [];

$hasBackup = file_exists($queueFile) && file_exists($dataBackupFile);

if(!$hasBackup){
    // No backup pair on disk: this is a fresh crawl.
    log_msg("NEW CRAWL STARTED");
} else {
    $savedQueue = json_decode(file_get_contents($queueFile), true);
    $savedData  = json_decode(file_get_contents($dataBackupFile), true);

    // Anything that does not decode to an array counts as empty.
    $data = is_array($savedData) ? $savedData : [];

    if(is_array($savedQueue) && count($savedQueue) > 0){
        // Pending work was saved: continue where the last run stopped.
        $queue = $savedQueue;
        log_msg("RESUME STARTED");
    } elseif(count($data) === 0){
        // Backup exists but holds nothing: restart from the start page.
        log_msg("QUEUE EMPTY - RESET START");
    } else {
        // Results exist but nothing is pending: re-seed with the start page.
        log_msg("QUEUE EMPTY BUT DATA EXISTS - RESET START");
    }
}

// Open the live progress page: document head with inline styles first, then
// the heading and the number of entries waiting at start-up.
$pageHead = "<!DOCTYPE html><html lang='ar' dir='rtl'><head><meta charset='UTF-8'><title>زحف مباشر</title>
<style>
body{font-family:Tahoma,Arial;background:#f4f6f8;padding:25px}
.node{display:inline-block;background:#f1f5ff;border:1px solid #d9e4ff;padding:7px 12px;border-radius:7px;color:#111;margin:2px}
ul{list-style:none}
.ok{color:green}
.err{color:red}
.info{color:#0b74de}
.end{color:#8a6d00}
</style></head><body>";

$pageIntro = "<h2>تقدم الزحف على الموقع</h2>"
    . "<p>المتبقي عند البدء: " . count($queue) . "</p>";

echo $pageHead, $pageIntro;
flush();

// Index of the URLs currently waiting in $queue, keyed by URL. It replaces a
// linear scan of the queue for every discovered link with a constant-time
// lookup. The loop itself never enqueues a URL twice, so one flag per URL is
// enough.
$queuedUrls = [];

foreach($queue as $pending){
    $pendingUrl = $pending["url"] ?? "";

    if(is_string($pendingUrl) && $pendingUrl !== ""){
        $queuedUrls[$pendingUrl] = true;
    }
}

// Breadth-first crawl: take the oldest entry, fetch it, classify it and
// append its unseen children. State is saved after every processed entry so
// the crawl can be resumed.
while(!empty($queue)){
    $current = array_shift($queue);

    $url   = $current["url"] ?? "";
    $title = $current["title"] ?? "بدون عنوان";

    // Skip entries without a usable URL (e.g. from a damaged backup file).
    if(!is_string($url) || $url === "") continue;

    // The entry has left the queue; a later link to it may be queued again
    // unless it has been stored in $data by then.
    unset($queuedUrls[$url]);

    if(data_has($data, $url)){
        save_state($queue, $data);
        continue;
    }

    echo "<div class='info'>معالجة: <strong>" . htmlspecialchars($title) . "</strong> - " . htmlspecialchars($url) . "</div>";
    flush();

    log_msg("PROCESSING: " . $title . " - " . $url);

    $html = get_html($url);

    if(!$html){
        echo "<div class='err'>فشل تحميل الصفحة: " . htmlspecialchars($url) . "</div>";
        flush();

        log_msg("FAILED: " . $url);
        save_failed($url, $title);
        save_state($queue, $data);

        // Pause after failures too, so a struggling or blocking server is not
        // hit in a tight loop.
        sleep($delay);
        continue;
    }

    // Leaf article page (the site root is never treated as one).
    if($url !== "https://moalmy.com/" && $url !== "https://www.moalmy.com/" && is_final_page($html)){
        $data[$url] = [
            "title" => $title,
            "url" => $url,
            "children" => [],
            "status" => "final_blog_detail_header"
        ];

        echo "<div class='ok'>صفحة نهائية blog-detail-header: " . htmlspecialchars($url) . "</div>";
        flush();

        log_msg("FINAL BLOG DETAIL HEADER: " . $url);

        save_state($queue, $data);
        sleep($delay);
        continue;
    }

    $children = extract_links($html, $url);

    if(empty($children)){
        $data[$url] = [
            "title" => $title,
            "url" => $url,
            "children" => [],
            "status" => "empty"
        ];

        echo "<div class='end'>لا توجد روابط داخل القسم المطلوب</div>";
        flush();

        log_msg("EMPTY: " . $url);

        save_state($queue, $data);
        sleep($delay);
        continue;
    }

    // Enqueue every child that is neither stored nor already waiting.
    foreach($children as $child){
        $childUrl = $child["url"] ?? "";

        if(!$childUrl) continue;

        if(!data_has($data, $childUrl) && !isset($queuedUrls[$childUrl])){
            $queue[] = $child;
            $queuedUrls[$childUrl] = true;
        }
    }

    $data[$url] = [
        "title" => $title,
        "url" => $url,
        "children" => $children,
        "status" => "done"
    ];

    save_state($queue, $data);

    echo "<div>تم الحفظ — المتبقي: " . count($queue) . "</div>";
    flush();

    sleep($delay);
}

// Final snapshot once the queue has drained.
save_state($queue, $data);

log_msg("CRAWL FINISHED");

echo "<h3>الزحف اكتمل! الخريطة محفوظة في sitemap.html</h3>";
echo "</body></html>";