主程序
#include "my_curl.h"
#include "check.h"
#include <string.h>
#include <time.h>
#include <fstream>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
void GetCsdnBlogList(const string& uid, set<string>& setUrl);
/**
 * Program entry point.
 *
 * With a command-line argument, argv[1] is passed to GetCsdnBlogList() to
 * collect article URLs. Without one, a usage message is printed and URLs are
 * read line by line from the local file "site.txt".
 *
 * After collecting, 20 GET requests are issued, each to a URL picked at
 * random from the collected list.
 *
 * @return 0 on completion, -1 when no URL could be collected.
 */
int main(int argc, char* argv[])
{
    // Number of random requests issued once the URL list is known.
    const int kVisitCount = 20;

    std::set<std::string> setUrl;
    if (argc > 1)
    {
        GetCsdnBlogList(argv[1], setUrl);
    }
    else
    {
        printf("两种工作方式:\n1. %s csdn_bloger_uid 访问CSDN博客列表\n", argv[0]);
        printf("2. 默认访问本地网站列表文件site.txt指定的地址\n");

        std::ifstream ifile("site.txt");
        if (ifile.is_open())
        {
            std::string strLine;
            while (std::getline(ifile, strLine))
            {
                // Drop a trailing carriage return left over from CRLF files.
                if (!strLine.empty() && strLine[strLine.size() - 1] == '\r')
                {
                    strLine.erase(strLine.size() - 1);
                }
                // A blank line is not a URL; requesting it could only fail.
                if (!strLine.empty())
                {
                    setUrl.insert(strLine);
                }
            }
        }
    }

    if (setUrl.empty())
    {
        return -1;
    }

    // Copy into a vector so a URL can be picked by random index.
    std::vector<std::string> vecUrl(setUrl.begin(), setUrl.end());
    setUrl.clear();

    CHttpClient http;
    std::string strResult;
    srand(static_cast<unsigned int>(time(NULL)));
    for (int i = 1; i <= kVisitCount; ++i)
    {
        const size_t j = rand() % vecUrl.size();
        // The response body is appended to the output string, so reset it
        // for every request instead of letting it grow without bound.
        strResult.clear();
        http.Get(vecUrl[j], strResult);
        std::cout << i << "\t" << vecUrl[j] << std::endl;
    }
    return 0;
}
/**
 * Extracts article links from one page of blog-list HTML.
 *
 * For every occurrence of "link_title" in @p html, the text between the next
 * "article/details/" and the following "\">" is taken as the article id
 * (truncated to 63 characters), and
 * "http://blog.csdn.net/<uid>/article/details/<id>" is inserted into
 * @p setUrl. Each id is also echoed to stdout, preceded by '\r'.
 *
 * Scanning stops at the first entry that is missing either the
 * "article/details/" marker or the closing "\">" (e.g. truncated HTML).
 *
 * @param html    Page content to scan.
 * @param uid     Blog user id used to build the absolute article URL.
 * @param setUrl  Receives the article URLs; existing entries are kept.
 * @return true if at least one URL that was not already in @p setUrl was
 *         added, false otherwise.
 */
bool GetCsdnBlogArticle(const std::string& html, const std::string& uid, std::set<std::string>& setUrl)
{
    const std::string strKeyWord = "link_title";
    const std::string strKeyArticle = "article/details/";
    const std::string strKeyEnd = "\">";
    const std::string strArticle = "http://blog.csdn.net/" + uid + "/article/details/";
    // Upper bound on the length of an extracted article id.
    const std::string::size_type nMaxIdLen = 63;

    const std::set<std::string>::size_type nCountBefore = setUrl.size();

    std::string::size_type nPos = 0;
    while ((nPos = html.find(strKeyWord, nPos)) != std::string::npos)
    {
        std::string::size_type nIdBeg = html.find(strKeyArticle, nPos);
        if (nIdBeg == std::string::npos)
        {
            break; // title marker without an article link: nothing more to extract
        }
        nIdBeg += strKeyArticle.length();

        const std::string::size_type nIdEnd = html.find(strKeyEnd, nIdBeg);
        if (nIdEnd == std::string::npos)
        {
            break; // link is not terminated: the HTML is truncated
        }

        const std::string::size_type nIdLen = nIdEnd - nIdBeg;
        const std::string strId = html.substr(nIdBeg, nIdLen > nMaxIdLen ? nMaxIdLen : nIdLen);

        setUrl.insert(strArticle + strId);
        std::cout << "\r" << strId;

        // Resume the search for the next title marker right after this link.
        nPos = nIdBeg;
    }
    return setUrl.size() != nCountBefore;
}
/**
 * Collects the article URLs of a CSDN blog by walking its list pages.
 *
 * Pages "http://blog.csdn.net/<uid>/article/list/<n>" are fetched for
 * n = 1, 2, ... and handed to GetCsdnBlogArticle(). The walk stops at the
 * first page that contributes no new URL to @p setUrl. Progress and the final
 * count are written to stdout.
 *
 * @param uid     Blog user id.
 * @param setUrl  Receives the article URLs.
 */
void GetCsdnBlogList(const std::string& uid, std::set<std::string>& setUrl)
{
    CHttpClient http;
    const std::string strUsr = "http://blog.csdn.net/" + uid + "/article/list/";
    std::string strHtml;
    int nPage = 0;

    std::cout << "抓到文章 -> \n";
    do
    {
        // Build the page URL with a stream so the uid length is not limited
        // by a fixed-size buffer.
        std::ostringstream ossUrl;
        ossUrl << strUsr << ++nPage;

        // The response is appended to the output string; clear it so only
        // the current page is parsed instead of every page fetched so far.
        strHtml.clear();
        http.Get(ossUrl.str(), strHtml);
    } while (GetCsdnBlogArticle(strHtml, uid, setUrl));

    std::cout << "\n共抓取到文章" << setUrl.size() << "篇" << std::endl;
}
封装的curl类
#include "my_curl.h"
#include "curl/curl.h"
#include <string>
#pragma comment(lib, "ws2_32.lib")
#pragma comment(lib, "wldap32.lib")
#if _DEBUG
#pragma comment(lib, "libcurld.lib")
#else
#pragma comment(lib, "libcurl.lib")
#endif
// Constructs a client with debug output switched off.
CHttpClient::CHttpClient()
    : m_bDebug(false)
{
}
// Nothing to release here.
CHttpClient::~CHttpClient()
{
}
/**
 * Debug callback installed via CURLOPT_DEBUGFUNCTION.
 *
 * Prints incoming/outgoing headers and data to stdout, each prefixed with a
 * tag naming its kind. CURLINFO_TEXT and any other info type are not printed.
 *
 * @param itype  Kind of debug data being reported.
 * @param pData  Debug payload; libcurl does NOT NUL-terminate it.
 * @param size   Number of valid bytes at @p pData.
 * @return Always 0.
 */
static int OnDebug(CURL *, curl_infotype itype, char * pData, size_t size, void *)
{
    const char* pszTag = NULL;
    switch (itype)
    {
    case CURLINFO_HEADER_IN:
        pszTag = "[HEADER_IN]";
        break;
    case CURLINFO_HEADER_OUT:
        pszTag = "[HEADER_OUT]";
        break;
    case CURLINFO_DATA_IN:
        pszTag = "[DATA_IN]";
        break;
    case CURLINFO_DATA_OUT:
        pszTag = "[DATA_OUT]";
        break;
    default:
        // CURLINFO_TEXT and the remaining info types are deliberately silent.
        return 0;
    }

    fputs(pszTag, stdout);
    if (pData != NULL && size > 0)
    {
        // Write exactly `size` bytes: the payload has no terminating NUL, so
        // a "%s" format would read past the end of the buffer.
        fwrite(pData, 1, size, stdout);
    }
    fputc('\n', stdout);
    return 0;
}
/**
 * Write callback installed via CURLOPT_WRITEFUNCTION.
 *
 * Appends the received chunk to the std::string passed as user data.
 *
 * @param buffer  Received bytes (not NUL-terminated).
 * @param size    Size of one element.
 * @param nmemb   Number of elements; the chunk is size * nmemb bytes long.
 * @param lpVoid  Pointer to the std::string that accumulates the response.
 * @return The number of bytes consumed (size * nmemb), or (size_t)-1 when
 *         either pointer is NULL, which differs from the chunk size and
 *         therefore makes libcurl abort the transfer.
 */
static size_t OnWriteData(void* buffer, size_t size, size_t nmemb, void* lpVoid)
{
    std::string* pResponse = static_cast<std::string*>(lpVoid);
    if (NULL == pResponse || NULL == buffer)
    {
        return static_cast<size_t>(-1);
    }

    const size_t nBytes = size * nmemb;
    pResponse->append(static_cast<const char*>(buffer), nBytes);
    // libcurl expects the byte count actually handled, not the element count.
    return nBytes;
}
int CHttpClient::Post(const std::string & strUrl, const std::string & strPost, std::string & strResponse)
{
CURLcode res;
CURL* curl = curl_easy_init();
if(NULL == curl)
{
return CURLE_FAILED_INIT;
}
if(m_bDebug)
{
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, OnDebug);
}
curl_easy_setopt(curl, CURLOPT_URL, strUrl.c_str());
curl_easy_setopt(curl, CURLOPT_POST, 1);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, strPost.c_str());
curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 3);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3);
res = curl_easy_perform(curl);
curl_easy_cleanup(curl);
return res;
}
int CHttpClient::Get(const std::string & strUrl, std::string & strResponse)
{
CURLcode res;
CURL* curl = curl_easy_init();
if(NULL == curl)
{
return CURLE_FAILED_INIT;
}
if(m_bDebug)
{
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, OnDebug);
}
curl_easy_setopt(curl, CURLOPT_URL, strUrl.c_str());
curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse);
/**
* 当多个线程都使用超时处理的时候,同时主线程中有sleep或是wait等操作。
* 如果不设置这个选项,libcurl将会发信号打断这个wait从而导致程序退出。
*/
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 3);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3);
res = curl_easy_perform(curl);
curl_easy_cleanup(curl);
return res;
}
int CHttpClient::Posts(const std::string & strUrl, const std::string & strPost, std::string & strResponse, const char * pCaPath)
{
CURLcode res;
CURL* curl = curl_easy_init();
if(NULL == curl)
{
return CURLE_FAILED_INIT;
}
if(m_bDebug)
{
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, OnDebug);
}
curl_easy_setopt(curl, CURLOPT_URL, strUrl.c_str());
curl_easy_setopt(curl, CURLOPT_POST, 1);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, strPost.c_str());
curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
if(NULL == pCaPath)
{
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false);
}
else
{
//缺省情况就是PEM,所以无需设置,另外支持DER
//curl_easy_setopt(curl,CURLOPT_SSLCERTTYPE,"PEM");
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, true);
curl_easy_setopt(curl, CURLOPT_CAINFO, pCaPath);
}
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 3);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3);
res = curl_easy_perform(curl);
curl_easy_cleanup(curl);
return res;
}
int CHttpClient::Gets(const std::string & strUrl, std::string & strResponse, const char * pCaPath)
{
CURLcode res;
CURL* curl = curl_easy_init();
if(NULL == curl)
{
return CURLE_FAILED_INIT;
}
if(m_bDebug)
{
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, OnDebug);
}
curl_easy_setopt(curl, CURLOPT_URL, strUrl.c_str());
curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
if(NULL == pCaPath)
{
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false);
}
else
{
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, true);
curl_easy_setopt(curl, CURLOPT_CAINFO, pCaPath);
}
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 3);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3);
res = curl_easy_perform(curl);
curl_easy_cleanup(curl);
return res;
}
///////////////////////////////////////////////////////////////////////////////////////////////
void CHttpClient::SetDebug(bool bDebug)
{
m_bDebug = bDebug;
}
网友评论