一、环境搭建

windows 10
mongodb
nosqlbooster
python3.7.3

1.1mongodb server

mongodb 是一个流行的非关系型数据库，其采用文档格式来存储数据，具有较好的灵活性，用Python语言读写数据库非常简单方便，其社区版本可以免费使用，下载地址为https://www.mongodb.com/download-center/community

下载mongodb server

安装过程请注意

1.2 nosqlbooster

nosqlbooster是一个mongodb的可视化管理工具，是收费的，如果仅仅是增删改查的话，其免费的功能已够用了。
官网的下载地址为https://www.nosqlbooster.com/downloads

nosqlbooster下载界面

1.3 pymongo

pymongo是Python连接mongodb数据库的第三方包，可以使用pip进行安装

pip install pymongo

简单的使用可参考 https://pypi.org/project/pymongo/
文档 https://api.mongodb.com/python/current/

二、解析Nmap扫描结果

使用下面的命令会以xml格式来保存nmap的扫描结果
nmap -sS x.x.x.0/24 -oX xxx.xml
使用Python的BeautifulSoup库来解析xml文件，将解析出来的结果存入MongoDB数据库，方便查询和利用。
代码如下
parse_xml.py

#-*- coding:utf-8 -*-
# author: wlj 
# time: 2020/2/28 10:15
# 解析nmap xml格式的扫描结果，存入mongodb数据库
import os
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Connect to the local MongoDB server; `mongo` is the `nmap_result` collection of the `asset` database
mongo = MongoClient("localhost", 27017).asset.nmap_result

def parse_xml(xml_filename):
    """Parse an nmap XML report and upsert every live host into MongoDB.

    For each <host> whose <status> has state "up", the address of the host
    and the list of its <port> entries (protocol, port id, state, service
    name) are written to the module-level ``mongo`` collection, keyed by
    IP. Each stored IP is printed.

    Args:
        xml_filename: Path of an XML file produced by ``nmap -oX``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(xml_filename, 'r') as xml_file:
        soup = BeautifulSoup(xml_file.read(), 'lxml')

    # Walk over every <host> element of the report.
    for host in soup.find_all('host'):
        # Only hosts reported as alive are stored.
        if host.status['state'] != 'up':
            continue

        # IP address of the host (first <address> element).
        ip = host.address['addr']

        # A <host> may carry no <ports> child (e.g. presumably for
        # discovery-only scans); treat that as "no ports" instead of
        # failing on None.
        ports_tag = host.ports
        port_tags = ports_tag.find_all('port') if ports_tag is not None else []

        # Collect protocol, port number, state and service name per port.
        ports = []
        for port in port_tags:
            # A <port> may lack a <service> child; store an empty service
            # name in that case rather than subscripting None.
            service = port.service
            ports.append({
                'protocol': port['protocol'],
                'portid': port['portid'],
                'state': port.state['state'],
                'service': service['name'] if service is not None else '',
            })

        # Write to MongoDB: create the record for this IP if it does not
        # exist yet, otherwise update its port list.
        mongo.update_one(
            {'ip': ip},
            {'$set': {'ports': ports}},
            upsert=True,
        )
        print(ip)

def main():
    """Walk the current directory tree and parse every ``.xml`` file found."""
    for dirpath, _subdirs, filenames in os.walk('.'):
        # Keep only the files whose name ends with .xml
        xml_names = [entry for entry in filenames if entry.endswith('.xml')]
        for xml_name in xml_names:
            # Build the full file name from the directory and the entry name
            parse_xml(os.sep.join((dirpath, xml_name)))

main()

结果

mongodb数据库

三、提取web站点的title

本节使用requests来获取所有的web站点的title和banner信息，代码如下。
get_web_title.py

#-*- coding:utf-8 -*-
# author:wlj
# time: 2020/2/28 13:27
# 从mongodb数据库中获取开放80端口的IP，请求主页，获取response和网站title
import requests,re,chardet
from requests.packages import urllib3
from pymongo import MongoClient
import threading
from queue import Queue
urllib3.disable_warnings()


# Module-level globals
# Headers sent with every HTTP request
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}

# Task queue holding the URLs to fetch
task_queue = Queue()
# Mutex used to control the worker threads' access to task_queue
threadLock = threading.Lock()
# Collection that stores the website information
mongo_website = MongoClient("localhost", 27017).asset.website

def get_web_title(url):
    """Fetch *url* and store its page title and response headers in MongoDB.

    The page is requested with the module-level ``headers``. Nothing is
    stored when the request fails or the status code is not 200. Otherwise
    the record for *url* in ``mongo_website`` is created or updated with the
    title (empty string when no <title> is found) and the response headers,
    and the URL and title are printed.

    Args:
        url: Address of the page to request, e.g. ``http://1.2.3.4``.
    """
    try:
        # Certificate verification is disabled only for https URLs; for
        # http URLs the default (verify=True) is kept.
        r = requests.get(
            url,
            headers=headers,
            verify=not url.startswith('https'),
            timeout=5,
        )
    except requests.RequestException:
        # Best effort: a host that cannot be fetched is simply skipped.
        return

    if r.status_code != 200:
        return

    home_title = ''
    # Tag names are case-insensitive and a title may span several lines.
    match = re.search(r'<title[^>]*>(.*?)</title>', r.text, re.I | re.S)
    if match:
        home_title = match.group(1)
        # r.text was decoded with r.encoding, which may not be the real
        # charset of the page. Guess the charset from the raw bytes and, when
        # possible, re-decode the title with it. r.encoding can be None, in
        # which case the title taken from r.text is kept as is.
        if r.encoding:
            try:
                dect_encoding = chardet.detect(r.content)['encoding']
                if dect_encoding:
                    home_title = home_title.encode(r.encoding).decode(dect_encoding)
            except (UnicodeError, LookupError):
                # Re-decoding failed: fall back to the title from r.text.
                home_title = match.group(1)
        # Strip only after re-decoding: bytes of a mis-decoded title can
        # look like whitespace characters.
        home_title = home_title.strip()

    # Store the page information: insert the record if it does not exist,
    # otherwise update it.
    mongo_website.update_one(
        {'url': url},
        {'$set': {
            'title': home_title,  # title of the home page
            # Response headers, copied into a plain dict for storage.
            'response_headers': dict(r.headers),
        }},
        upsert=True,
    )
    print(url, home_title)
                
class myThread(threading.Thread):
    """Worker thread that takes URLs from ``task_queue`` until it is empty.

    Each URL taken from the queue is passed to ``get_web_title``.
    """

    def __init__(self):
        super().__init__()

    def run(self):
        """Process queued URLs one at a time; return once the queue is empty."""
        while True:
            # Hold the lock while checking and popping so that the emptiness
            # test and the get() form one step. The context manager releases
            # the lock on every exit path, including break and exceptions.
            with threadLock:
                if task_queue.empty():
                    # Nothing left to do: leave the loop.
                    break
                # Take one URL from the queue.
                url = task_queue.get()
                # Report the current size of the queue.
                print('queue size', task_queue.qsize())
            # The network request runs outside the lock so that the other
            # workers are not blocked while this one waits.
            get_web_title(url)

def main():
    """Queue a URL for every host with port 80 recorded and fetch all titles.

    Reads the ``nmap_result`` collection, fills ``task_queue`` with
    ``http://<ip>`` URLs, runs 20 ``myThread`` workers, waits for them to
    finish and prints a completion message.
    """
    # The nmap_result collection
    nmap_results = MongoClient("localhost", 27017).asset.nmap_result

    # Fetch the IP of every record that has a port with portid '80' and
    # build the URL list
    records = nmap_results.find({'ports.portid':'80'},{'ip':1})
    urls = ['http://'+record['ip'] for record in records]
    # Put these URLs on the task queue
    for url in urls:
        task_queue.put(url)

    # Create and start the worker threads, keeping a handle on each
    workers = []
    for _ in range(20):
        worker = myThread()
        worker.start()
        workers.append(worker)

    # Wait until every worker has finished before continuing
    for worker in workers:
        worker.join()

    print('completed!')

main()

结果

web站点