# -*- coding: utf-8 -*-
# -------------------------------------------------------------------------------
# Name:         sfp_template
# Purpose:      Example module to use for new modules.
#
# Author:      Steve Micallef
#
# Created:     2020-04-21
# Copyright:   (c) Steve Micallef
# Licence:     MIT
# -------------------------------------------------------------------------------

import json

from netaddr import IPNetwork

from spiderfoot import SpiderFootEvent, SpiderFootPlugin


class sfp_template(SpiderFootPlugin):
    """Example SpiderFoot module demonstrating the structure of a plugin.

    The module watches a handful of event types, expands owned netblocks into
    individual IP addresses, queries a third-party API (SHODAN is used as the
    example) for each address and emits events describing what was returned.
    """

    # The module descriptor dictionary contains all the meta data about a module necessary
    # for users to understand...
    meta = {
        # Module name: A very short but human readable name for the module.
        'name': "Template Module",

        # Description: A sentence briefly describing the module.
        'summary': "This is an example module to help developers create their own SpiderFoot modules.",

        # Flags: Attributes about this module:
        #   - apikey: Needs an API key to function
        #   - slow: Can be slow to find information
        #   - errorprone: Might generate high false positives
        #   - invasive: Interrogates the target, might be intensive
        #   - tool: Runs an external tool to collect data
        'flags': ["slow", "apikey"],

        # Use cases: The use case(s) this module should be included in, options are Footprint, Investigate and Passive.
        #   - Passive means the user's scan target is not contacted at all
        #   - Footprint means that this module is useful when understanding the target's footprint on the Internet
        #   - Investigate means that this module is useful when investigating the danger/risk of a target
        'useCases': ["Passive"],

        # Categories: The categories this module belongs in, describing how it operates. Only the first category is
        # used for now.
        #   - Content Analysis: Analyses content found
        #   - Crawling and Scanning: Performs crawling or scanning of the target
        #   - DNS: Queries DNS
        #   - Leaks, Dumps and Breaches: Queries data dumps and breaches
        #   - Passive DNS: Analyses passive DNS sources
        #   - Public Registries: Queries open/public registries of information
        #   - Real World: Queries sources about the real world (addresses, names, etc.)
        #   - Reputation Systems: Queries systems that describe the reputation of other systems
        #   - Search Engines: Searches public search engines with data about the whole Internet
        #   - Secondary Networks: Queries information about participation on secondary networks, like Bitcoin
        #   - Social Media: Searches social media data sources
        'categories': ["Social Media"],

        # For tool modules, have some basic information about the tool.
        'toolDetails': {
            # The name of the tool
            'name': "Nmap",

            # Descriptive text about the tool
            'description': "Detailed descriptive text about the tool",

            # The website URL for the tool. In many cases this will also be the
            # repo, but no harm in duplicating it.
            'website': 'https://tool.org',

            # The repo where the code of the tool lives.
            'repository': 'https://github.com/author/tool'
        },

        # Information about the data source (if any) this module queries for data. For modules
        # that purely parse data from other modules (e.g. sfp_email), this may be omitted.
        'dataSource': {
            # The primary website URL for the data source.
            'website': "https://www.datasource.com",

            # The subscription model for this data source.
            #   - FREE_NOAUTH_UNLIMITED: Completely free, no need to obtain an API key and no limits
            #                            imposed beyond throttling.
            #   - FREE_NOAUTH_LIMITED: Completely free, no need to obtain an API key however limits
            #                          are imposed and you need to register/pay to exceed them.
            #   - FREE_AUTH_UNLIMITED: Completely free, however you must obtain an API key to access
            #                          the service with no limits imposed beyond throttling.
            #   - FREE_AUTH_LIMITED: Completely free, however you must obtain an API key and limits
            #                        are imposed. You need to upgrade(pay) to exceed them.
            #   - COMMERCIAL_ONLY: No free tier is available at all.
            #   - PRIVATE_ONLY: Invite only. Usually for betas and similar programs.
            'model': "FREE_NOAUTH_LIMITED",

            # Links to additional information. May be omitted.
            'references': [
                "https://www.datasource.com/api-documentation"
            ],

            # If an API key is optional or required, information on how to obtain the API key.
            # Each array element is a step. Ensure URLs are full URLs so they can be linked
            # automatically in the UI.
            'apiKeyInstructions': [
                "Visit https://www.datasource.com",
                "Register a free account",
                "Click on 'Account Settings'",
                "Click on 'Developer'",
                "The API key is listed under 'Your API Key'"
            ],

            # URL of the favicon for the data source.
            'favIcon': "https://www.datasource.com/favicon.ico",

            # URL of the full-size logo for the data source.
            'logo': "https://www.datasource.com/logo.gif",

            # A paragraph or two about the data source.
            'description': "A paragraph of text with details about the data source / services. "
            "Keep things neat by breaking the text up across multiple lines as "
            "has been done here. If line breaks are needed for breaking up "
            "multiple paragraphs, use \n.",
        }
    }

    # Default options. Delete any options not applicable to this module. Descriptions for each option
    # are defined in optdescs below. Options won't show up in the UI if they don't have an entry in
    # optdescs. This can be useful when you want something configured in code but not by the user.
    #
    # Note that these are just dictionary entries. The logic for how you react to these settings
    # is entirely for you to define AND IMPLEMENT in this module - nothing comes for free! :)
    #
    # Look at other modules for examples for how these settings are handled and implemented.
    #
    opts = {
        # If the module needs an API key, ensure api_key is in the name so that it gets
        # picked up as such in the UI.
        'api_key': '',

        # If the module accepts CO_HOSTED_SITE as an event, it sometimes makes sense to make
        # that configurable since some users don't care about co-hosted sites.
        'checkcohosts': True,

        # As above, but for affiliates.
        'checkaffiliates': True,

        # As above, but for NETBLOCK_MEMBERs.
        'subnetlookup': False,

        # As above, but for NETBLOCK_OWNER
        'netblocklookup': True,

        # If subnetlookup is true, what's the maximum size subnet to iterate through?
        'maxsubnet': 24,

        # As above but for netblocks owned.
        'maxnetblock': 24,

        # For modules reporting CO_HOSTED_SITE events, it makes sense to put a cap
        # on how many to return since a high number usually indicates hosting, and users
        # likely do not care about such cases.
        'maxcohost': 100,

        # When reporting hosts, perform DNS lookup to check if they still resolve, and
        # if not report INTERNET_NAME_UNRESOLVED instead, if appropriate.
        'verify': True,

        # If reporting co-hosted sites, consider a site co-hosted if its domain matches
        # the target?
        "cohostsamedomain": False
    }

    # Option descriptions. Delete any options not applicable to this module.
    optdescs = {
        "api_key": "SomeDataource API Key.",
        'checkcohosts': "Check co-hosted sites?",
        'checkaffiliates': "Check affiliates?",
        'netblocklookup': "Look up all IPs on netblocks deemed to be owned by your target for possible blacklisted hosts on the same target subdomain/domain?",
        'maxnetblock': "If looking up owned netblocks, the maximum netblock size to look up all IPs within (CIDR value, 24 = /24, 16 = /16, etc.)",
        'subnetlookup': "Look up all IPs on subnets which your target is a part of?",
        'maxsubnet': "If looking up subnets, the maximum subnet size to look up all the IPs within (CIDR value, 24 = /24, 16 = /16, etc.)",
        'maxcohost': "Stop reporting co-hosted sites after this many are found, as it would likely indicate web hosting.",
        "cohostsamedomain": "Treat co-hosted sites on the same target domain as co-hosting?",
        'verify': 'Verify that any hostnames found on the target domain still resolve?'
    }

    # Tracking results can be helpful to avoid reporting/processing duplicates
    results = None

    # Tracking the error state of the module can be useful to detect when a third party
    # has failed and you don't wish to process any more events.
    errorState = False

    def setup(self, sfc, userOpts=None):
        """Prepare the module for a scan.

        Args:
            sfc: The SpiderFoot helper object; stored as ``self.sf`` and used
                by query() to fetch URLs.
            userOpts: Optional mapping of option name to value. Every entry is
                written into ``self.opts``, overriding the defaults above.
                Omitting it (or passing None) leaves the defaults untouched.
        """
        self.sf = sfc

        # self.tempStorage() basically returns a dict(), but we use self.tempStorage()
        # instead since on SpiderFoot HX, different mechanisms are used to persist
        # data for load distribution, avoiding excess memory consumption and fault
        # tolerance. This keeps modules transparently compatible with both versions.
        self.results = self.tempStorage()

        # Clear / reset any other class member variables here
        # or you risk them persisting between threads.
        self.errorState = False

        # The data source for a module is, by default, set to the module name.
        # If you want to override that, for instance in cases where the module
        # is purely processing data from other modules instead of producing
        # data itself, you can do so with the following. Note that this is only
        # utilised in SpiderFoot HX and not the open source version.
        self.__dataSource__ = "Some Data Source"

        # A None default (instead of a shared dict() default) avoids the
        # mutable-default-argument pitfall.
        if userOpts:
            self.opts.update(userOpts)

    # What events is this module interested in for input
    # For a list of all events, check spiderfoot/db.py.
    def watchedEvents(self):
        """Return the list of event types this module wants to receive."""
        return [
            "IP_ADDRESS",
            "NETBLOCK_OWNER",
            "DOMAIN_NAME",
            "WEB_ANALYTICS_ID"
        ]

    # What events this module produces
    def producedEvents(self):
        """Return the list of event types this module may emit."""
        return [
            "OPERATING_SYSTEM",
            "DEVICE_TYPE",
            "TCP_PORT_OPEN",
            "TCP_PORT_OPEN_BANNER",
            'RAW_RIR_DATA',
            'GEOINFO',
            'VULNERABILITY_GENERAL',
            # handleEvent() also creates these two event types when expanding
            # netblocks, so they are declared here as well.
            'IP_ADDRESS',
            'AFFILIATE_IPADDR'
        ]

    # When querying third parties, it's best to have a dedicated function
    # to do so and avoid putting it in handleEvent()
    def query(self, qry):
        """Query the third party for information about ``qry``.

        Args:
            qry: The value to look up; it is interpolated into the request URL.

        Returns:
            The decoded JSON response, or None if the response had no content
            or the content could not be parsed as JSON.
        """
        # This is an example of querying SHODAN. Note that the fetch timeout
        # is inherited from global options (options prefixed with _ will come
        # from global config), and the user agent is SpiderFoot so that the
        # provider knows the request comes from the tool. Many third parties
        # request that, so best to just be consistent anyway.
        res = self.sf.fetchUrl(
            f"https://api.shodan.io/shodan/host/{qry}?key={self.opts['api_key']}",
            timeout=self.opts['_fetchtimeout'],
            useragent="SpiderFoot"
        )

        # Report when unexpected things happen:
        # - debug(message) if it's only for debugging (user will see this if debugging is enabled)
        # - info(message) if it's not a bad thing
        # - error(message) if it's a bad thing and should cause the scan to abort
        # - fatal(message) if it's a horrible thing and should kill SpiderFoot completely
        if res['content'] is None:
            self.info(f"No SHODAN info found for {qry}")
            return None

        # Always process external data which is expected to be in a specific format
        # with try/except since we cannot trust the data is formatted as intended.
        try:
            return json.loads(res['content'])
        except Exception as e:
            self.error(f"Error processing JSON response from SHODAN: {e}")

        return None

    # Handle events sent to this module
    def handleEvent(self, event):
        """Process one received event and emit any resulting events.

        Args:
            event: The received SpiderFootEvent. Its ``eventType``, ``data``
                and ``moduleDataSource`` attributes are read here.
        """
        # The three most used fields in SpiderFootEvent are:
        # event.eventType - the event type, e.g. INTERNET_NAME, IP_ADDRESS, etc.
        # event.module - the name of the module that generated the event, e.g. sfp_dnsresolve
        # event.data - the actual data, e.g. 127.0.0.1. This can sometimes be megabytes in size (e.g. a PDF)
        eventName = event.eventType
        eventData = event.data

        # Once we are in this state, return immediately.
        if self.errorState:
            return

        # This module is flagged as needing an API key, so complain once if it
        # is missing and set self.errorState so the complaint is not repeated
        # (and no pointless requests are made) for every subsequent event.
        if self.opts['api_key'] == "":
            self.error("You enabled sfp_template but did not set an API key!")
            self.errorState = True
            return

        # Check if the module has already analysed this event data.
        if eventData in self.results:
            self.debug(f"Skipping {eventData}, already checked.")
            return

        # Add the event data to results dictionary to prevent duplicate queries.
        # If eventData might be something large, set the key to a hash
        # of the value instead of the value, to avoid memory abuse.
        self.results[eventData] = True

        if eventName == 'NETBLOCK_OWNER':
            # Note here an example of handling the netblocklookup option
            if not self.opts['netblocklookup']:
                return

            max_netblock = self.opts['maxnetblock']
            net_size = IPNetwork(eventData).prefixlen
            # A smaller prefix length means a larger network.
            if net_size < max_netblock:
                self.debug(f"Network size {net_size} bigger than permitted: {max_netblock}")
                return

        # When handling netblocks/subnets, assuming the user set
        # netblocklookup/subnetlookup to True, we need to expand it
        # to the IPs for looking up.
        qrylist = list()
        if eventName.startswith("NETBLOCK_"):
            for ipaddr in IPNetwork(eventData):
                qrylist.append(str(ipaddr))
                self.results[str(ipaddr)] = True
        else:
            qrylist.append(eventData)

        for addr in qrylist:
            # Perform the query to the third party; in this case for each IP
            # being queried.
            rec = self.query(addr)

            # Handle the response being empty/failing
            if rec is None:
                continue

            # For netblocks, we need to create the IP address event so that
            # the threat intel event is more meaningful and linked to the
            # IP address within the network, not the whole network.
            if eventName == 'NETBLOCK_OWNER':
                # This is where the module generates an event for other modules
                # to process and is a fundamental part of the SpiderFoot architecture.
                # We are generating an event of type "IP_ADDRESS" here, the data being
                # the addr variable, the name of the module is the next argument
                # (self.__name__), and finally the event that is linked as the source
                # event of this event. This enables SpiderFoot to link events so users
                # can see what events generated other events, seeing a full chain of
                # discovery from their target to the data returned here.
                pevent = SpiderFootEvent("IP_ADDRESS", addr, self.__name__, event)
                # With the event created, we can now notify any other modules listening
                # for IP_ADDRESS events (which they define in their watchedEvents()
                # function).
                self.notifyListeners(pevent)
            elif eventName == 'NETBLOCK_MEMBER':
                # NOTE(review): NETBLOCK_MEMBER is not listed in watchedEvents(),
                # so this branch only matters if that event type is added there.
                pevent = SpiderFootEvent("AFFILIATE_IPADDR", addr, self.__name__, event)
                self.notifyListeners(pevent)
            else:
                # If the event received wasn't a netblock, then use that event
                # as the source event for later events.
                pevent = event

            # When querying a third party API, always ensure to generate
            # a RAW_RIR_DATA event. Note that here we are seeing the pevent
            # event as the source for this, since the IP address is actually
            # what was queried against the third party, not the netblock.
            # So now we have NETBLOCK_OWNER (event we received) -> IP_ADDRESS
            # (event we generated above) -> RAW_RIR_DATA (event from the third
            # party about the IP Address we queried).
            evt = SpiderFootEvent("RAW_RIR_DATA", str(rec), self.__name__, pevent)
            self.notifyListeners(evt)

            # Whenever operating in a loop, call this to check whether the user
            # requested the scan to be aborted.
            if self.checkForStop():
                return

            # In some cases, you want to override the data source for the event
            # you're producing to be the data source of the event that you've
            # received. This is needed, for example, when the module is purely
            # extracting data from a received event, so the data source is not
            # actually this module, but the data source of the received event
            # itself! sfp_email is a good example, since it is purely looking
            # for e-mail addresses in received content, so an EMAILADDR event
            # should have a data source of whatever place the EMAILADDR was
            # actually found in. This is how you'd achieve that:
            #
            # NOTE(review): this assignment is applied to the RAW_RIR_DATA event
            # after it was already passed to notifyListeners() above; presumably
            # it is here for illustration only - confirm before copying the order.
            if event.moduleDataSource:
                evt.moduleDataSource = event.moduleDataSource
            else:
                # This should never happen, but just to be safe since other
                # code might depend on this field existing and not being None.
                evt.moduleDataSource = "Unknown"

            # Note that we are using rec.get('os') instead of rec['os'] - this
            # means we won't get an exception if the 'os' key doesn't exist. In
            # general, you should always use .get() instead of accessing keys
            # directly in case the key doesn't exist. The isinstance() check
            # covers responses that are valid JSON but not an object (e.g. a
            # list), which have no .get() at all.
            os_name = rec.get('os') if isinstance(rec, dict) else None
            if os_name:
                evt = SpiderFootEvent("OPERATING_SYSTEM", f"{os_name} ({addr})", self.__name__, pevent)
                self.notifyListeners(evt)

# End of sfp_template class