# -*- coding: utf-8 -*-
# -------------------------------------------------------------------------------
# Name:         sfp_crossref
# Purpose:      SpiderFoot plug-in for scanning links identified from the
#               spidering process, and for external links, fetching them to
#               see if those sites link back to the original site, indicating a
#               potential relationship between the external sites.
#
# Author:      Steve Micallef <steve@binarypool.com>
#
# Created:     06/04/2012
# Copyright:   (c) Steve Micallef 2012
# Licence:     MIT
# -------------------------------------------------------------------------------

import re

from spiderfoot import SpiderFootEvent, SpiderFootHelpers, SpiderFootPlugin


class sfp_crossref(SpiderFootPlugin):
    """Flag external sites as affiliates when their content links back to the target.

    For each watched event the module fetches the referenced external URL
    (or ``http://<domain>`` for domain-style events) and searches the
    returned content for any of the target's names. On a hit it emits an
    AFFILIATE_INTERNET_NAME event followed by an AFFILIATE_WEB_CONTENT event.
    """

    meta = {
        'name': "Cross-Referencer",
        'summary': "Identify whether other domains are associated ('Affiliates') of the target by looking for links back to the target site(s).",
        'flags': [],
        'useCases': ["Footprint"],
        'categories': ["Crawling and Scanning"]
    }

    opts = {
        'checkbase': True
    }

    optdescs = {
        "checkbase": "Check the base URL of the potential affiliate if no direct affiliation found?"
    }

    # URLs already tested during this scan, keyed by the URL that was fetched.
    fetched = None

    def setup(self, sfc, userOpts=None):
        """Store the SpiderFoot handle, reset per-scan state and apply user options.

        Args:
            sfc: SpiderFoot object used for fetching and DNS resolution.
            userOpts (dict): option overrides; each key is copied into ``self.opts``.
        """
        self.sf = sfc
        self.fetched = self.tempStorage()

        # ``None`` replaces the former shared mutable ``dict()`` default.
        if userOpts:
            for opt in list(userOpts.keys()):
                self.opts[opt] = userOpts[opt]

    def watchedEvents(self):
        """Return the event types this module consumes."""
        return [
            'LINKED_URL_EXTERNAL',
            'SIMILARDOMAIN',
            'CO_HOSTED_SITE',
            'DARKNET_MENTION_URL'
        ]

    def producedEvents(self):
        """Return the event types this module emits."""
        return [
            'AFFILIATE_INTERNET_NAME',
            'AFFILIATE_WEB_CONTENT'
        ]

    def _fetchContent(self, url):
        """Fetch ``url`` with the module's standard settings and return its content.

        Args:
            url (str): URL to retrieve.

        Returns:
            The ``content`` entry of the fetch result, or None when nothing
            usable was returned.
        """
        res = self.sf.fetchUrl(
            url,
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            sizeLimit=10000000,
            verify=False
        )

        # Defensive: treat a missing result the same as missing content
        # rather than failing on the subscript below.
        if not res:
            return None

        return res['content']

    def _mentionsTarget(self, content, trailingChars):
        """Check whether ``content`` references any of the target's names.

        A name only counts when it is preceded by one of ``. ' / "`` or a
        space and followed by a character matching ``trailingChars``.

        Args:
            content: fetched page content; converted with ``str()`` before searching.
            trailingChars (str): regex character class allowed right after the name.

        Returns:
            bool: True if at least one target name is found.
        """
        text = str(content)

        for name in self.getTarget().getNames():
            pat = re.compile(
                r"([\.\'\/\"\ ]" + re.escape(name) + trailingChars + r")",
                re.IGNORECASE
            )
            if pat.search(text):
                return True

        return False

    def handleEvent(self, event):
        """Test the site referenced by ``event`` for links back to the target.

        Args:
            event (SpiderFootEvent): one of the types listed in watchedEvents().
        """
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        self.debug(f"Received event, {eventName}, from {srcModuleName}")

        # SIMILARDOMAIN and CO_HOSTED_SITE events are domains, not URLs.
        # Assume HTTP.
        if eventName in ['SIMILARDOMAIN', 'CO_HOSTED_SITE']:
            url = 'http://' + eventData.lower()
        elif 'URL' in eventName:
            url = eventData
        else:
            return

        fqdn = self.sf.urlFQDN(url)

        # We are only interested in external sites for the crossref
        if self.getTarget().matches(fqdn):
            self.debug(f"Ignoring {url} as not external")
            return

        # Look up the same key that is stored below. Checking the raw event
        # data never matched for domain events, because those are stored
        # with the 'http://' prefix, so they were re-fetched every time.
        if url in self.fetched:
            self.debug(f"Ignoring {url} as already tested")
            return

        if not self.sf.resolveHost(fqdn) and not self.sf.resolveHost6(fqdn):
            self.debug(f"Ignoring {url} as {fqdn} does not resolve")
            return

        self.fetched[url] = True

        self.debug(f"Testing URL for affiliation: {url}")

        content = self._fetchContent(url)
        if content is None:
            self.debug(f"Ignoring {url} as no data returned")
            return

        # Search for mentions of our host/domain in the external site's data
        matched = self._mentionsTarget(content, r"[\.\'\/\"\ ]")

        # If the name wasn't found in the affiliate, and checkbase is set,
        # fetch the base URL of the affiliate to check for a crossref.
        if not matched and eventName == "LINKED_URL_EXTERNAL" and self.opts['checkbase']:
            url = SpiderFootHelpers.urlBaseUrl(eventData)
            if url in self.fetched:
                return

            self.fetched[url] = True

            content = self._fetchContent(url)
            if content is not None:
                # NOTE(review): unlike the first check, a trailing '.' is not
                # accepted here. Kept as found; confirm whether the
                # difference is intentional.
                matched = self._mentionsTarget(content, r"[\'\/\"\ ]")

        if not matched:
            return

        if not event.moduleDataSource:
            event.moduleDataSource = "Unknown"

        self.info(f"Found link to target from affiliate: {url}")

        evt1 = SpiderFootEvent(
            "AFFILIATE_INTERNET_NAME",
            self.sf.urlFQDN(url),
            self.__name__,
            event
        )
        evt1.moduleDataSource = event.moduleDataSource
        self.notifyListeners(evt1)

        # The content event hangs off the affiliate name event so the two
        # stay linked in the results.
        evt2 = SpiderFootEvent(
            "AFFILIATE_WEB_CONTENT",
            content,
            self.__name__,
            evt1
        )
        evt2.moduleDataSource = event.moduleDataSource
        self.notifyListeners(evt2)

# End of sfp_crossref class