// www.gusucode.com > Web-crawler VC++ source download - source program > code\webpageloader_SourceCode\DownloadFile.cpp
//Download by http://www.NewXing.com // DownloadFile.cpp: implementation of the CDownloadFile class. // ////////////////////////////////////////////////////////////////////// #include "stdafx.h" #include "WebPageLoader.h" #include "DownloadFile.h" #ifdef _DEBUG #undef THIS_FILE static char THIS_FILE[]=__FILE__; #define new DEBUG_NEW #endif ////////////////////////////////////////////////////////////////////// // // CDownloadFile // ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// // Construction/Destruction CDownloadFile::CDownloadFile() { m_pFile = NULL; m_State = FILESTATE_WAITING; m_iDownloadAttempts = 0; m_nLevel = 0; m_iLastImage = -1; m_lBytesDownloaded = 0; m_bSkipRequest = FALSE; } CDownloadFile::~CDownloadFile() { } ////////////////////////////////////////////////////////////////////// // Operations BOOL CDownloadFile::Create(LPCTSTR szURL, BOOL bIsImage, short nLevel/*=1*/) { ASSERT(AfxIsValidString(szURL)); if( szURL==NULL ) return FALSE; m_sURL = szURL; m_bIsImage = bIsImage; m_nLevel = nLevel; m_sURL.Replace("&", "&"); return TRUE; }; // Construct filename and local folder structures CString CDownloadFile::GetTargetFilename(CSession *pSession) { ASSERT_VALID(pSession); CString sPath; CString sFilename; int pos; sPath = pSession->m_Settings.m_sDownloadPath; ADDBACKSLASH(sPath); sFilename = m_sURL; sFilename.Replace(_T("\\"), _T("/")); // Mask off some traditional URL extensions pos = sFilename.Find(_T('?')); if( pos>=0 ) sFilename = sFilename.Left(pos); pos = sFilename.Find(_T('#')); if( pos>=0 ) sFilename = sFilename.Left(pos); // Get to the bare filename pos = sFilename.ReverseFind(_T('/')); if( pos<0 ) return CString(); sFilename = sFilename.Mid(pos+1); if( sFilename.IsEmpty() ) sFilename += _T("index.html"); sFilename.Replace(_T("%20"),_T(" ")); sFilename.Replace(_T("%7E"),_T("-")); sFilename.Replace(_T("%7e"),_T("-")); for( int x1=0x20; x1<=0x2F; x1++ ) { 
CString sTemp; sTemp.Format(_T("%%%02x"), x1); sFilename.Replace(sTemp, _T("")); sTemp.Format(_T("%%%02X"), x1); sFilename.Replace(sTemp, _T("")); } sFilename.TrimRight(); sFilename.TrimRight("-.~'"); if( sFilename.GetLength() > 2 && sFilename.GetAt(1) == '.' && _ttoi(sFilename) > 0 ) { CString sTemp = (LPCTSTR)sFilename; sFilename = _T("0") + sTemp; } // If "append to target path" is requested, we transform the // URL and append it to the target filename. if( pSession->m_Settings.m_bAppendSourcePath ) { CString sPart = m_sURL; int pos = sPart.Find(_T("//")); if( pos>=0 ) sPart = sPart.Mid(pos + 2); sPart.TrimLeft("/ "); sPart.Replace(_T('/'),_T('\\')); if( pSession->m_Settings.m_bPrettyPath ) PrettyPath(sPart, sFilename); sPart.Replace(_T("%20"),_T(" ")); sPart.Replace(_T("%21"),_T(" ")); sPart.Replace(_T("%22"),_T(" ")); sPart.Replace(_T("%23"),_T(" ")); sPart.Replace(_T("%7E"),_T("-")); sPart.Replace(_T("%7e"),_T("-")); sPart.Replace(_T("%3A"),_T("")); sPart.Replace(_T("%3a"),_T("")); sPart.Remove(_T('*')); sPart.Remove(_T('?')); sPart.Remove(_T(':')); sPart.Remove(_T('<')); sPart.Remove(_T('>')); sPart.Remove(_T('"')); sPart.Remove(_T('\'')); sPart.Replace(_T("%2b"),_T("&")); sPart.Replace(_T("%2B"),_T("&")); sPart.Replace(_T("%2C"),_T(",")); sPart.Replace(_T("%2c"),_T(",")); sPart.Replace(_T("%26"),_T("&")); sPart.Replace(_T("%27"),_T("\'")); sPart.TrimRight(); for( int x2=0x20; x2<=0x2F; x2++ ) { CString sTemp; sTemp.Format(_T("%%%02x"), x2); sPart.Replace(sTemp, _T("")); sTemp.Format(_T("%%%02X"), x2); sPart.Replace(sTemp, _T("")); } pos = sPart.ReverseFind(_T('\\')); if( pos > 0 ) { sPath += sPart.Left(pos); } ADDBACKSLASH(sPath); } CDir dir; dir.Create(sPath); // If file does not exists, then we can safely create a new... if( !BfxFileExists(sPath + sFilename) ) return sPath + sFilename; // If skip duplicates, then skip this one... 
if( pSession->m_Settings.m_Duplicates == DUP_SKIP ) { if( BfxHasValidExtension(sFilename, HTML_EXTENSIONS) ) return sPath + sFilename; return CString(); } // If we overwrite existing file, then file routine should do this... if( pSession->m_Settings.m_Duplicates == DUP_OVERWRITE ) return sPath + sFilename; // Rename scenario... horror! We do a simple increment from a-z. If file // does not exists, then use it. Otherwise fail... CString sFile, sExt; // First we need to seperate the filename and extension... pos = sFilename.ReverseFind(_T('.')); if( pos<0 ) return CString(); sFile = sFilename.Left(pos); sExt = sFilename.Mid(pos); // Then scan for a free filename for( TCHAR c=_T('a'); c<=_T('z'); c++ ) { CString sNewFilename; if( pSession->m_Settings.m_bPrefixRename ) sNewFilename.Format(_T("%c%s%s"), c, (LPCTSTR)sFile, (LPCTSTR)sExt); else sNewFilename.Format(_T("%s%c%s"), (LPCTSTR)sFile, c, (LPCTSTR)sExt); if( !BfxFileExists(sPath + sNewFilename) ) return sPath + sNewFilename; }; return CString(); }; // Downloads the file, store it and parse it. BOOL CDownloadFile::Download(CSession *pSession, CInternetSession *pInet, int iPosition) { TRACE("Download: %s\n", (LPCTSTR)m_sURL); ASSERT_VALID(pSession); ASSERT_VALID(pInet); CString sServer; CString sPage; DWORD dwType; INTERNET_PORT nPort; AfxParseURL( m_sURL, dwType, sServer, sPage, nPort ); CString sFilename; // Validate output { // First check if the page should be created at all... CSingleLock lock( *pSession, TRUE ); if( ValidateURL( pSession, sServer, sPage )==FALSE ) { return FALSE; } // Now get the filename (and create the local file)... sFilename = GetTargetFilename(pSession); if( sFilename.IsEmpty() ) { CSingleLock lock( *pSession, TRUE ); m_State = FILESTATE_ALREADYTHERE; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_NOFILE, (LPCTSTR)sPage); return FALSE; } } // Make the download happen... 
BOOL bRes = DoDownload(pSession, pInet, sServer, sPage, nPort, dwType, sFilename ); if( bRes ) bRes = ParseFile(pSession, sFilename); // Delete it again if we don't want it... { CSingleLock lock( *pSession, TRUE ); if( pSession->m_Settings.m_bDontKeepHTML ) { if( !m_bIsImage ) ::DeleteFile(sFilename); // Delete empty folder for( int i = 0; i < 10; i++ ) { int iPos = sFilename.ReverseFind('\\'); if( iPos < 0 ) break; sFilename = sFilename.Left(iPos); int cchDir = pSession->m_OrigSettings.m_sDownloadPath.GetLength(); if( sFilename.Left(cchDir) == pSession->m_OrigSettings.m_sDownloadPath && sFilename.GetLength() > cchDir ) { BOOL bRes = ::RemoveDirectory(sFilename); if( !bRes && ::GetLastError() != ERROR_DIR_NOT_EMPTY ) { bool bFound = false; for( int i=0; !bFound && i<pSession->m_aDelFolders.GetSize(); i++ ) if( pSession->m_aDelFolders[i] == sFilename ) bFound = true; if( !bFound ) pSession->m_aDelFolders.Add(sFilename); break; } } } } } // Randomize files in session after first download { CSingleLock lock( *pSession, TRUE ); if( iPosition == 1 && pSession->m_Settings.m_bRandomize ) RandomizeSession(pSession); } // Download complete (or failed) - refresh UI { CSingleLock lock( *pSession, TRUE ); ::PostMessage(pSession->m_Preferences->m_hwndFrame, WM_REFRESHITEMS, 0, 0); } return bRes; }; // Does the actual file download from the internet. BOOL CDownloadFile::DoDownload( CSession *pSession, CInternetSession *pInet, CString &sServer, CString &sPage, INTERNET_PORT nPort, DWORD dwType, LPCTSTR strFilename) { ASSERT_VALID(pSession); ASSERT_VALID(pInet); ASSERT(strFilename); CHttpConnection* pServer = NULL; CHttpFile *pInetFile = NULL; CFile* pGenFile = NULL; BOOL bDownloadOK = FALSE; // This is the default! 
DWORD dwSessionID = pSession->m_iUniqueID; if( !BfxIsValidSession(pSession) ) goto done; m_iDownloadAttempts++; m_State = FILESTATE_CONNECTING; m_lBytesDownloaded = 0; ::PostMessage(pSession->m_Preferences->m_hwndFrame, WM_REFRESHNODES, dwSessionID, 0); TRY { DWORD dwRet; TCHAR szHeaders[1024] = { 0 }; LPCTSTR szUsername = NULL; LPCTSTR szPassword = NULL; { CSingleLock lock( *pSession, TRUE ); // Look for a valid username/password... if( pSession->m_sUsername.GetLength() > 0 ) szUsername = pSession->m_sUsername; if( pSession->m_sPassword.GetLength() > 0 ) szPassword = pSession->m_sPassword; } if( pSession->m_Preferences->m_bCompatibilityMode ) { CString sReferer; sReferer.Format(_T("Referer: http://%s/index.html\r\n"), sServer); _tcscpy( szHeaders, _T("Accept: image/gif, image/x-xbitmap, image/jpeg, */*\r\n") ); _tcscat( szHeaders, _T("Accept-Language: en-us\r\n") ); _tcscat( szHeaders, sReferer ); _tcscat( szHeaders, _T("Connection: Keep-Alive\r\n") ); } // Open regular file if it's local if( dwType == AFX_INET_SERVICE_FILE ) { CSingleLock lock( *pSession, TRUE ); m_State = FILESTATE_DOWNLOADING; pGenFile = new CFile; CString sFilename = m_sURL; sFilename.Replace(_T("file:///"), _T("")); pGenFile->Open(sFilename, CFile::modeRead); goto read_loop; } // Open HTTP connection pServer = pInet->GetHttpConnection(sServer, nPort, szUsername, szPassword); ASSERT_VALID(pServer); if( pServer == NULL ) { CSingleLock lock( *pSession, TRUE ); // It failed. It should have thrown an exception! m_State = FILESTATE_ERROR; pSession->m_Info.m_nFilesFailed++; pSession->Log(LOGTYPE_ERROR, IDS_LOG_SERVERFAIL, (LPCTSTR)sServer); goto done; } // Set timeout options pServer->SetOption(INTERNET_OPTION_SEND_TIMEOUT, 30 * 1000); pServer->SetOption(INTERNET_OPTION_RECEIVE_TIMEOUT, 40 * 1000); // Open file... 
pInetFile = pServer->OpenRequest(CHttpConnection::HTTP_VERB_GET, sPage, NULL, 1, NULL, NULL, INTERNET_FLAG_EXISTING_CONNECT | INTERNET_FLAG_RELOAD); ASSERT_VALID(pInetFile); if( pInetFile==NULL ) { CSingleLock lock( *pSession, TRUE ); m_State = FILESTATE_ERROR; pSession->m_Info.m_nFilesFailed++; pSession->Log(LOGTYPE_ERROR, IDS_LOG_FILENOTFOUND, (LPCTSTR)sPage); goto done; } { CSingleLock lock( *pSession, TRUE ); pSession->m_pCurrentDownloadFile = pInetFile; m_pFile = pInetFile; } pInetFile->AddRequestHeaders(szHeaders, HTTP_ADDREQ_FLAG_REPLACE | HTTP_ADDREQ_FLAG_ADD); pInetFile->SendRequest(); pInetFile->QueryInfoStatusCode(dwRet); if( dwRet == HTTP_STATUS_OK ) { // // Ok, we've got a file connection. Now download... // if( !pSession->m_Preferences->m_bCompatibilityMode ) { CString strNewLocation; pInetFile->QueryInfo(HTTP_QUERY_RAW_HEADERS_CRLF, strNewLocation); int nPlace = strNewLocation.Find(_T("Location: ")); if( nPlace >= 0 ) { strNewLocation = strNewLocation.Mid(nPlace + 10); nPlace = strNewLocation.Find('\n'); if( nPlace > 0 ) sPage = strNewLocation.Left(nPlace); } } { CSingleLock lock( *pSession, TRUE ); m_State = FILESTATE_DOWNLOADING; if( pSession->m_Settings.m_bUseFileSizeFilter && !pSession->m_Preferences->m_bCompatibilityMode ) { // Let's just see the how big this file is... 
DWORD dwLen = 4; DWORD dwTotalSize = 0; pInetFile->QueryInfo(HTTP_QUERY_CONTENT_LENGTH | HTTP_QUERY_FLAG_NUMBER, &dwTotalSize, &dwLen, NULL ); // If it is an image and we have file-size filter on, // then check now if( m_bIsImage && (dwTotalSize>0) && (long)(dwTotalSize/1024) < pSession->m_Settings.m_iFileSizeFilter ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_NOTBIGENOUGH, (LPCTSTR)sPage); goto done; } } } pGenFile = pInetFile; read_loop: ::PostMessage(pSession->m_Preferences->m_hwndFrame, WM_REFRESHITEMS, 0, 0); ::PostMessage(pSession->m_Preferences->m_hwndFrame, WM_REFRESHNODES, dwSessionID, 0); // Now download and write out the file... CStdioFile f; BOOL bOk; bOk = f.Open(strFilename, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary|CFile::shareExclusive); if( bOk ) { // // Now download file and write to disk // #define DOWNLOADBUFFER 2048 BYTE szBuff[DOWNLOADBUFFER]; // Read a chunk UINT nRead = pGenFile->Read(szBuff, DOWNLOADBUFFER-1); // We do some initial test on the first buffer downloaded... if( (nRead > 0) && (nRead < DOWNLOADBUFFER) ) { CSingleLock lock( *pSession, TRUE ); if( ValidateFirstBuffer(pSession, sPage, szBuff, nRead)==FALSE ) { // Error, file did not match or was invalid. // A log should have been produced by ValidateFirstBuffer() // Do not read any further, and fail #ifdef _DEBUG CFile f; f.Open("Q:\\Download\\dump.txt", CFile::modeCreate | CFile::modeWrite); f.Write(szBuff, nRead); f.Close(); #endif nRead = 0; bOk = FALSE; } } while( (nRead > 0) && (nRead < DOWNLOADBUFFER) ) { #ifdef _DEBUG //::Sleep(200); // Simulate net lag!!! #endif // // Write to file // #define NUM_WRITE_RETRIES 4 bool bWriteDone = false; int nWriteRetries = 0; while( !bWriteDone && nWriteRetries < NUM_WRITE_RETRIES ) { TRY { f.Write(szBuff,nRead); bWriteDone = true; } CATCH_ALL(e) { // Catch write error and pause thread, then retry... 
::Sleep(1000); nWriteRetries++; } END_CATCH_ALL } if( !bWriteDone ) { m_State = FILESTATE_ERROR; pSession->m_Info.m_nFilesFailed++; pSession->SafeLog(LOGTYPE_LOG, IDS_LOG_FILEWRITE, (LPCTSTR)strFilename); bOk = FALSE; break; } // // Check for new state // { CSingleLock lock( *pSession, TRUE ); // Do some statistics... m_lBytesDownloaded += nRead; pSession->m_Info.m_lBytesDownloaded += nRead; // Did someone request us to stop? if( pSession->m_bSleepRequest || pSession->m_bStopRequest || pSession->m_bKillRequest ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_SKIPPED, (LPCTSTR)sPage); bOk = FALSE; break; } if( m_bSkipRequest ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_SKIPPED, (LPCTSTR)sPage); bOk = FALSE; break; } } // Read next buffer... nRead = pGenFile->Read(szBuff, DOWNLOADBUFFER-1); } m_pFile = NULL; f.Close(); // // Make sure it's a valid file // if( bOk ) { CSingleLock lock( *pSession, TRUE ); bOk = ValidateDownloadedFile(pSession, strFilename, sPage); } if( bOk ) { CSingleLock lock( *pSession, TRUE ); // SUCCESS! m_State = FILESTATE_DONE; pSession->m_Info.m_nFilesDownloaded++; pSession->m_Preferences->m_nFilesDownloaded++; pSession->m_Preferences->m_lBytesDownloaded += m_lBytesDownloaded; pSession->Log(LOGTYPE_LOG, IDS_LOG_FILEDOWNLOADED, (LPCTSTR)sPage); bDownloadOK = TRUE; } else { // If the download fail, we remove the (invalid) file! ::DeleteFile(strFilename); } } else { CSingleLock lock( *pSession, TRUE ); // File Create error m_State = FILESTATE_ERROR; pSession->m_Info.m_nFilesFailed++; pSession->Log(LOGTYPE_LOG, IDS_LOG_FILECREATE, (LPCTSTR)strFilename); } } else { // // Error! 
// pSession->m_Info.m_nFilesFailed++; switch( dwRet ) { case HTTP_STATUS_NOT_FOUND: { m_State = FILESTATE_ERROR; pSession->SafeLog(LOGTYPE_ERROR, IDS_LOG_FILENOTFOUND, (LPCTSTR)sPage); }; break; case HTTP_STATUS_MOVED: case HTTP_STATUS_REDIRECT: case HTTP_STATUS_REDIRECT_METHOD: { m_State = FILESTATE_SKIPPED; pSession->SafeLog(LOGTYPE_ERROR, IDS_LOG_FILEMOVED, (LPCTSTR)sPage); }; break; case HTTP_STATUS_DENIED: case HTTP_STATUS_FORBIDDEN: case HTTP_STATUS_PAYMENT_REQ: case HTTP_STATUS_USE_PROXY: case HTTP_STATUS_PROXY_AUTH_REQ: { m_State = FILESTATE_ERROR; pSession->SafeLog(LOGTYPE_ERROR, IDS_LOG_ACCESSDENIED, (LPCTSTR)sPage); }; break; case HTTP_STATUS_BAD_REQUEST: case HTTP_STATUS_SERVER_ERROR: case HTTP_STATUS_UNSUPPORTED_MEDIA: case HTTP_STATUS_VERSION_NOT_SUP: { m_State = FILESTATE_ERROR; pSession->SafeLog(LOGTYPE_ERROR, IDS_LOG_SERVERERROR, (LPCTSTR)sPage); }; break; case HTTP_STATUS_REQUEST_TIMEOUT: case HTTP_STATUS_GATEWAY_TIMEOUT: case HTTP_STATUS_SERVICE_UNAVAIL: { m_State = FILESTATE_BROKEN; pSession->SafeLog(LOGTYPE_ERROR, IDS_LOG_TIMEOUT, (LPCTSTR)sPage); }; break; case HTTP_STATUS_PARTIAL: case HTTP_STATUS_PRECOND_FAILED: case HTTP_STATUS_PARTIAL_CONTENT: case HTTP_STATUS_REQUEST_TOO_LARGE: { m_State = FILESTATE_BROKEN; pSession->SafeLog(LOGTYPE_ERROR, IDS_LOG_INCOMPLETE, (LPCTSTR)sPage); }; break; default: { m_State = FILESTATE_ERROR; pSession->SafeLog(LOGTYPE_ERROR, IDS_LOG_GENERALERROR, (LPCTSTR)sPage); } break; } } } CATCH_ALL( e ) { // Catch errors from WinInet m_State = FILESTATE_ERROR; m_pFile = NULL; pSession->m_Info.m_nFilesFailed++; pSession->SafeLog(LOGTYPE_ERROR, IDS_LOG_GENERALERROR, (LPCTSTR)sPage); // Delete the file anyway... 
::DeleteFile(strFilename); } END_CATCH_ALL done: // Clean up local file if( dwType == AFX_INET_SERVICE_FILE ) { TRY { m_pFile = NULL; pGenFile->Close(); } CATCH_ALL( e ) { pGenFile->Abort(); } END_CATCH_ALL delete pGenFile; pGenFile = NULL; pInetFile = NULL; pSession->m_pCurrentDownloadFile = NULL; } // Clean up file and internet mess if( pInetFile != NULL ) { ASSERT_VALID(pInetFile); TRY { m_pFile = NULL; pInetFile->Close(); } CATCH_ALL( e ) { pInetFile->Abort(); } END_CATCH_ALL delete pInetFile; pGenFile = NULL; pInetFile = NULL; pSession->m_pCurrentDownloadFile = NULL; } // Close connection if( pServer!=NULL ) { ASSERT_VALID(pServer); TRY { pServer->Close(); } CATCH_ALL( e ) { } END_CATCH_ALL delete pServer; pServer = NULL; } // Return return bDownloadOK; }; BOOL CDownloadFile::ValidateFirstBuffer(CSession *pSession, LPCTSTR pstrFile, LPBYTE pBuffer, DWORD dwSize) // Validate the first downloaded buffer (usually 1024 bytes). // We check for common problems, such as empty page returned, images // failed to load (returns HTML page with error message) or animated gif // banners. { ASSERT_VALID(pSession); ASSERT(pstrFile); ASSERT(pBuffer); LPCTSTR pstrBuffer = (LPCTSTR)pBuffer; ASSERT(::IsBadReadPtr(pstrBuffer,dwSize)==FALSE); if( ::IsBadReadPtr(pstrBuffer,dwSize) ) return FALSE; // Not big enough to be a valid file at all! if( dwSize < 8 ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_LOG, IDS_LOG_EMPTYFILE, pstrFile); return FALSE; } // Only images are really checked from this point... // We assume everything else are valid HTML/download files. if( !m_bIsImage ) return TRUE; if( _tcsstr(pstrFile, ".txt") != NULL ) return TRUE; // Copy the buffer into a string. Make sure we // do nothing wrong and pad 0-bytes with spaces. CString s; LPTSTR p = s.GetBuffer(dwSize+1); for( DWORD i=0; i<dwSize; i++ ) { p[i] = ( pstrBuffer[i]=='\0' ? 
_T(' ') : pstrBuffer[i] ); } p[dwSize] = _T('\0'); s.ReleaseBuffer(dwSize+1); s.MakeLower(); if( s.Find(_T("<html>")) >= 0 || s.Find(_T("<head>")) >= 0 || s.Find(_T("<body ")) >= 0 || s.Find(_T("<frameset ")) >= 0 || s.Find(_T("<script")) >= 0 || s.Find(_T("<meta ")) >= 0 ) { // Error returned in html format (e.g. "image not downloaded" etc) m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_LOG, IDS_LOG_BADIMAGE, pstrFile); return FALSE; } // Is it an executable (virus)? if( s.Find("mz") == 0 ) { // Error returned in html format (e.g. "image not downloaded" etc) m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_LOG, IDS_LOG_BADIMAGE, pstrFile); return FALSE; } // It's a GIF. Check for animated GIFs -> banner! if( pSession->m_Type == TYPE_HTMLSCAN && pSession->m_Settings.m_bSkipBanners ) { if( (s.Find(_T("gif98a")) == 0 || s.Find(_T("gif89a")) == 0) && s.Find(_T("netscape2.0")) > 0 ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_LOG, IDS_LOG_BANNER, pstrFile); return FALSE; }; }; // Well, couldn't find anything wrong. Let's continue... return TRUE; }; BOOL CDownloadFile::ValidateURL(CSession *pSession, LPCTSTR strServer, LPCTSTR strPage) // Validate that the download URL is valid. // If it's an obvious banner url, then refuse. 
{ ASSERT_VALID(pSession); ASSERT(strServer); ASSERT(strPage); static LPCTSTR szBannerProviders[] = { _T("doubleclick"), _T("sitemeter"), _T("adclick"), _T("ban/"), _T("ads/"), _T("/ad."), _T(".ad/"), _T("click2net"), _T("click-"), _T("click."), _T("click/"), _T("exchange"), _T("newads"), _T("adswap"), _T("paid"), _T("pay"), _T("thumb"), _T("thmb"), _T("/thn_"), _T("/tn_"), _T("/th_"), _T("/TN_"), _T("/TH_"), _T("_t."), _T("/t/"), _T("/th/"), _T("/tn/"), _T("mini/"), _T("avatar/"), _T("/small"), _T("_logo."), _T("join"), _T("banner"), _T("button"), _T("counter"), _T("javascript"), _T("vbscript"), _T("blank"), _T("spacer"), _T("menu-"), NULL }; static LPCTSTR szImageProviders[] = { _T("imagevenue"), _T("imagehosting"), _T("imagehigh"), _T("imagepile"), _T("imageshack"), _T("jpghosting"), _T("freeimagehosting"), _T("photobucket"), _T("picsserver"), _T("picserver"), NULL }; CString sServer = strServer; CString sPage = strPage; CString sServerLower = sServer; CString sPageLower = sPage; sServerLower.MakeLower(); sPageLower.MakeLower(); CString sUrlLower = pSession->m_sURL; sUrlLower.MakeLower(); // Populate the image-provider and banner-provider lists if( pSession->m_Preferences->m_aBannerProviders.GetSize() == 0 ) { LPCTSTR* p; p = szBannerProviders; while( *p!=NULL ) { pSession->m_Preferences->m_aBannerProviders.Add(CString(*p)); p++; } p = szImageProviders; while( *p!=NULL ) { pSession->m_Preferences->m_aImageProviders.Add(CString(*p)); p++; } // Load from .ini file TCHAR szIniFile[MAX_PATH] = { 0 }; ::GetModuleFileName(NULL, szIniFile, MAX_PATH); _tcscpy(_tcsrchr(szIniFile, '.'), _T(".ini")); CString sKey; int i; for( i=1; ; i++ ) { TCHAR szValue[200] = { 0 }; sKey.Format(_T("%d"), i); ::GetPrivateProfileString(_T("BannerProviders"), sKey, _T(""), szValue, 199, szIniFile); if( _tcslen(szValue) == 0 ) break; pSession->m_Preferences->m_aBannerProviders.Add(CString(szValue)); } for( i=1; ; i++ ) { TCHAR szValue[200] = { 0 }; sKey.Format(_T("%d"), i); 
::GetPrivateProfileString(_T("ImageProviders"), sKey, _T(""), szValue, 199, szIniFile); if( _tcslen(szValue) == 0 ) break; pSession->m_Preferences->m_aImageProviders.Add(CString(szValue)); } } // Known image-provider? Then ignore domain limits set by configuration. BOOL bImageProviderHost = FALSE; for( int i=0; i<pSession->m_Preferences->m_aImageProviders.GetSize(); i++ ) { if( sServerLower.Find(pSession->m_Preferences->m_aImageProviders.GetAt(i)) >= 0 ) { bImageProviderHost = TRUE; } } // Are we stepping outside the original session domain? if( !bImageProviderHost && pSession->m_Settings.m_bPreventOutsideDomain ) { if( sServer.CompareNoCase(pSession->m_Info.m_sServer) != 0 ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_OUTSIDEDOMAIN, (LPCTSTR)sPage); return FALSE; } } // Are we stepping outside the original session URL? if( !bImageProviderHost && pSession->m_Settings.m_bPreventOutsideURL ) { int iPos; CString sFirst = strPage; CString sSecond = sPage; iPos = sFirst.ReverseFind('/'); if( iPos > 0 ) sFirst = sFirst.Left(iPos); iPos = sSecond.ReverseFind('/'); if( iPos > 0 ) sSecond = sSecond.Left(iPos); if( sServer.CompareNoCase(pSession->m_Info.m_sServer) != 0 || sFirst != sSecond ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_OUTSIDEURL, (LPCTSTR)sPage); return FALSE; } } // Can we recognize a banner provider? if( pSession->m_Type == TYPE_HTMLSCAN && pSession->m_Settings.m_bSkipBanners ) { for( int i=0; i<pSession->m_Preferences->m_aBannerProviders.GetSize(); i++ ) { if( sPageLower.Find(pSession->m_Preferences->m_aBannerProviders.GetAt(i)) >= 0 && sUrlLower.Find(pSession->m_Preferences->m_aBannerProviders.GetAt(i)) < 0 ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_BANNER, (LPCTSTR)sPage); return FALSE; // Found a banner provider! } } } // Does it have a "must contain" clause? 
// BUG: We do this on the page (not the filename, but the whole // after-server-is-appended string) if( m_bIsImage ) { if( pSession->m_Settings.m_bUseFileNameFilter ) { CString sPage = strPage; CString sFilter = pSession->m_Settings.m_sFileNameFilter; sPage.MakeUpper(); sFilter.MakeUpper(); // Match with pattern... if( MatchPatterns(sPage, sFilter) == FALSE ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_NOPATTERNMATCH, (LPCTSTR)strPage); return FALSE; } }; } // Does it have an exclude filter? if( pSession->m_Settings.m_bUseExcludeFilter && !pSession->m_Settings.m_sExcludeFilter.IsEmpty() ) { CString sPage = strPage; CString sFilter = pSession->m_Settings.m_sExcludeFilter; sPage.MakeUpper(); sFilter.MakeUpper(); // Match with pattern... if( MatchPatterns(sPage, sFilter) == TRUE ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_NOPATTERNMATCH, (LPCTSTR)strPage); return FALSE; } } return TRUE; }; BOOL CDownloadFile::ValidateDownloadedFile(CSession *pSession, LPCTSTR strFilename, LPCTSTR strURL) // Make some basic tests on the completely downloaded file. // We should see if the download was broken or too small. { BOOL bOk = TRUE; long dwTotalSize = ::BfxGetFileSize(strFilename); // Check if file was stopped/paused during download if( bOk ) { if( pSession->m_bSleepRequest || pSession->m_bStopRequest || pSession->m_bKillRequest ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_STOPPED, strURL); bOk = FALSE; } } // Check if file was downloaded at all? if( bOk ) { if( dwTotalSize < 8 ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesFailed++; pSession->Log(LOGTYPE_ERROR, IDS_LOG_EMPTYFILE, strURL); bOk = FALSE; } } // Check if file has valid size. If file-size filter is on, we // check against this. 
if( bOk ) { if( m_bIsImage && pSession->m_Settings.m_bUseFileSizeFilter && (dwTotalSize/1024) < pSession->m_Settings.m_iFileSizeFilter ) { m_State = FILESTATE_SKIPPED; pSession->m_Info.m_nFilesSkipped++; pSession->Log(LOGTYPE_WARNING, IDS_LOG_NOTBIGENOUGH, strURL); bOk = FALSE; } } return bOk; }; BOOL CDownloadFile::ParseFile(CSession *pSession, LPCTSTR strFilename) // This function parses an HTML file for links, images and stuff. // WILL LOCK THE SESSION RIGHT BEFORE USING IT! { TRACE("Parse File: %s\n", strFilename); // Make sure file has the right extension, so it // can be parsed. // We can only parse HTML content... if( !BfxHasValidExtension(strFilename, HTML_EXTENSIONS) ) return TRUE; // Read the file into a large string buffer CStdioFile f; BOOL bRes = f.Open(strFilename, CFile::modeRead|CFile::typeText); if( !bRes ) return TRUE; CString sTxt, s; while( f.ReadString(s) ) sTxt += s; f.Close(); sTxt.TrimLeft(); if( sTxt.IsEmpty() ) return FALSE; CString sLowerTxt = sTxt; sLowerTxt.MakeLower(); sLowerTxt.Replace(_T('\t'),_T(' ')); CString sValue; int pos = 0; int endpos = 0; while( FindToken(sTxt, sLowerTxt, pos, sValue, endpos) ) { sValue = ExtractURL(sValue,m_sURL); AddUrl(pSession, sValue); pos = endpos; } if( CrackCode(sTxt, sValue) ) { sValue = ExtractURL(sValue,m_sURL); AddUrl(pSession, sValue); } return TRUE; } BOOL CDownloadFile::AddUrl(CSession *pSession, CString& sValue) { sValue.TrimRight(); if( !sValue.IsEmpty() ) { // We have a download URL string bool bAdd = false; bool bIsHTML = true; // Check if we want to add this kind of file if( BfxHasValidExtension(sValue, HTML_EXTENSIONS) ) { bAdd = true; } else if( BfxHasValidExtension(sValue, pSession->m_Settings.m_sFileExtensions) ) { bAdd = true; bIsHTML = false; } // Add this item? 
if( bAdd ) { CSingleLock lock(*pSession, TRUE); if( pSession->m_Settings.m_bPreventRepeating ) { if( pSession->m_Files.ItemExists(sValue) ) bAdd = false; } // Make sure we don't nest too deep if( m_nLevel >= pSession->m_Settings.m_nDownloadLevel ) bAdd = false; if( bIsHTML && pSession->m_Settings.m_bDontKeepHTML && m_nLevel + 1 >= pSession->m_Settings.m_nDownloadLevel ) bAdd = false; } if( bAdd ) { // Is it an image? BOOL bIsImage = BfxHasValidExtension(sValue, pSession->m_Settings.m_sFileExtensions); if( sValue.Find(".cgi") >= 0 ) bIsImage = FALSE; // Yes, it was a good file. Add it to queue... CDownloadFile *pInetFile = new CDownloadFile; pInetFile->Create( sValue, bIsImage, m_nLevel+1); CSingleLock lock(*pSession, TRUE); pSession->m_Files.AddTail(pInetFile); }; }; return TRUE; }; BOOL CDownloadFile::CrackCode(CString &sTxt, CString &sResult) { sResult = ""; if( sTxt.Find("return lD(") > 0 ) { int posStart = sTxt.Find("return lD(") + 10 + 1; int posEnd = sTxt.Find("'", posStart); if( posStart < 0 || posEnd < 0 ) return FALSE; CString sCodedUrl = sTxt.Mid(posStart, posEnd - posStart); int iOffset = sCodedUrl.GetAt(sCodedUrl.GetLength()-1) - '0'; for( int i = 0; i < sCodedUrl.GetLength()-1; i++ ) { CHAR ch = sCodedUrl.GetAt(i) - iOffset; sResult += ch; } return TRUE; } return FALSE; }; BOOL CDownloadFile::FindToken(CString &sTxt, CString &sLowerTxt, int Pos, CString &sResult, int &EndPos) // A function that scans the document from a particular position (the Pos argument) // for known HMTL tags. // It then parses the tag to find known link attributes (e.g. HREF and IMG attributes) // and extracts the string within. 
{ LPCTSTR szTags[] = { _T("<a"), _T("<img"), _T("<area"), _T("<embed"), _T("<option"), _T("<table"), _T("<frame"), _T("<iframe"), _T("<td"), NULL }; LPCTSTR szTokens[] = { _T("src="), _T("href="), _T("value="), _T("background="), NULL }; sResult.Empty(); LPCTSTR *pTags = szTags; int minpos = INT_MAX; int fpos; while( *pTags!=NULL ) { fpos = sLowerTxt.Find(*pTags, Pos); if( fpos>=0 && fpos<minpos) minpos = fpos; pTags++; }; if( minpos==INT_MAX ) return FALSE; EndPos = sLowerTxt.Find(_T('>'),minpos); if( EndPos<0 ) return FALSE; // From now on, we better return TRUE - even when // errors occur. This is because we have located the END tag // and we want to continue with the next tag! CString sTag; sTag = sTxt.Mid(minpos+1, EndPos-minpos); CString sLowerTag( sTag ); sLowerTag.MakeLower(); LPCTSTR *pTokens = szTokens; while( *pTokens != NULL ) { fpos = sLowerTag.Find(*pTokens); if( fpos>=0 ) break; pTokens++; }; if( fpos<0 ) return TRUE; sTag = sTag.Mid(_tcslen(*pTokens) + fpos); if( sTag.IsEmpty() ) return TRUE; sTag.TrimLeft(); TCHAR ch = sTag[0]; TCHAR EndTag; int eadd; switch( ch ) { case _T('\"'): case _T('\''): EndTag = ch; fpos=1; eadd=1; break; default: EndTag = _T(' '); fpos=0; eadd=0; }; int epos = sTag.Find(EndTag,fpos); if( epos<0 ) epos = sTag.Find('>',fpos); if( epos<0 ) return TRUE; sResult = sTag.Mid(fpos,epos-eadd); if( sResult.IsEmpty() ) return TRUE; TRACE("Found tag: %s\n", (LPCTSTR)sResult.Left(500)); EndPos++; return TRUE; }; void CDownloadFile::PrettyPath(CString& sPart, CString& sFilename) { // Path formatting sPart.Replace(_T("www."), _T("")); sPart.Replace(_T("img\\"), _T("\\")); sPart.Replace(_T("big\\"), _T("\\")); sPart.Replace(_T("Big\\"), _T("\\")); sPart.Replace(_T("img\\"), _T("\\")); sPart.Replace(_T("Img\\"), _T("\\")); sPart.Replace(_T("pic\\"), _T("\\")); sPart.Replace(_T("Pic\\"), _T("\\")); sPart.Replace(_T("pics\\"), _T("\\")); sPart.Replace(_T("Pics\\"), _T("\\")); sPart.Replace(_T("full\\"), _T("\\")); 
sPart.Replace(_T("Full\\"), _T("\\")); sPart.Replace(_T("image\\"), _T("\\")); sPart.Replace(_T("Image\\"), _T("\\")); sPart.Replace(_T("large\\"), _T("\\")); sPart.Replace(_T("Large\\"), _T("\\")); sPart.Replace(_T("images\\"), _T("\\")); sPart.Replace(_T("Images\\"), _T("\\")); sPart.Replace(_T("content\\"), _T("\\")); sPart.Replace(_T("Content\\"), _T("\\")); sPart.Replace(_T("gallery\\"), _T("\\")); sPart.Replace(_T("Gallery\\"), _T("\\")); sPart.Replace(_T("galleries\\"), _T("\\")); sPart.Replace(_T("Galleries\\"), _T("\\")); sPart.Replace(_T("bigimages\\"), _T("\\")); sPart.Replace(_T("BigImages\\"), _T("\\")); sPart.Replace(_T("%20"),_T(" ")); for( int x1=0x20; x1<=0x2F; x1++ ) { CString sTemp; sTemp.Format(_T("%%%02x"), x1); sPart.Replace(sTemp, _T("")); sTemp.Format(_T("%%%02X"), x1); sPart.Replace(sTemp, _T("")); } // Capitalize words bool bUpperNext = true; for( int i = 0; i < sPart.GetLength(); i++ ) { if( bUpperNext ) { sPart.SetAt(i, _totupper(sPart.GetAt(i))); bUpperNext = false; } if( sPart.GetAt(i) == '_' ) bUpperNext = true; if( sPart.GetAt(i) == '-' ) bUpperNext = true; if( sPart.GetAt(i) == ' ' ) bUpperNext = true; if( sPart.GetAt(i) == '\\' ) bUpperNext = true; if( sPart.GetAt(i) == '/' ) bUpperNext = true; } // Remove nasty characters sPart.Replace('_', ' '); } CString CDownloadFile::ExtractURL(LPCTSTR pstrValue, LPCTSTR pstrURL) // This function takes a string from the FindToken() function and turns // it into a valid URL. // The string may be passed as a relative URL, or invalid! We // must transform it to something that AfxParseURL() accepts. 
{ ASSERT(AfxIsValidString(pstrValue)); ASSERT(AfxIsValidString(pstrURL)); CString sValue( pstrValue ); CString sValueLowerCase( sValue ); sValueLowerCase.MakeLower(); if( sValue.IsEmpty() ) return CString(); if( sValue.Left(1) == _T("#") ) return CString(); CString sServer; CString sPage; DWORD dwType; INTERNET_PORT nPort; BOOL bRes = AfxParseURL( pstrURL, dwType, sServer, sPage, nPort ); if( !bRes ) return CString(); if( (dwType!=AFX_INET_SERVICE_HTTP) && (dwType!=AFX_INET_SERVICE_HTTPS) && (dwType!=AFX_INET_SERVICE_FILE) ) { return CString(); } sServer = _T("http://") + sServer; int pos; // We're interested in the path here... CString sPath = pstrURL; pos = sPath.ReverseFind(_T('/')); if( pos>0 ) sPath = sPath.Left(pos); pos = sPath.ReverseFind(_T('\\')); if( pos>0 ) sPath = sPath.Left(pos); sPath += _T('/'); // Look for known URL constructs and return fully qualified URL... sValue.Replace(_T('\\'), _T('/')); sValue.Replace(_T("/."), _T(".")); if( sValue.ReverseFind(_T('.'))<=sValue.ReverseFind(_T('/')) && sValue.Right(1)!=_T('/') && sValue.FindOneOf(_T("?#"))<0 ) sValue += _T("/"); if( sValue[0]==_T('/') ) return sServer + sValue; if( sValueLowerCase.Left(7)==_T("http://") ) return sValue; if( sValue.Left(3)==_T("www") ) return sValue; // It's a relative URL. Use WinINet library to resolve the URL. 
CString sResult; DWORD dwBufLen = MAX_URL_LEN-1; bRes = ::InternetCombineUrl(sServer + sPage, sValue, sResult.GetBuffer(MAX_URL_LEN), &dwBufLen, ICU_BROWSER_MODE); sResult.ReleaseBuffer(); if( !bRes ) return CString(); return sResult; }; BOOL CDownloadFile::RandomizeSession(CSession *pSession) { DWORD dwTick = ::GetTickCount(); int iStart = pSession->m_Info.m_nFilesDownloaded + pSession->m_Info.m_nFilesSkipped; int nCount = pSession->m_Files.GetCount() - iStart; while( true ) { POSITION pos = pSession->m_Files.GetHeadPosition(); int n = (::rand() % nCount) + iStart; while( pos!=NULL ) { CDownloadFile *pFile = pSession->m_Files.GetNext(pos); if( n-- == 0 ) { if( pos != NULL && pFile->m_State == FILESTATE_WAITING) { pSession->m_Files.RemoveAt( pSession->m_Files.Find(pFile) ); pSession->m_Files.AddTail(pFile); } break; } } DWORD dwNow = ::GetTickCount(); if( (long)(dwNow - dwTick) < 0 ) break; if( dwNow - dwTick > 1500 ) break; } return TRUE; } ////////////////////////////////////////////////////////////////////// // // CFileList // ////////////////////////////////////////////////////////////////////// CFileList::~CFileList() { // This implementaiton of CFileList deletes the elements // on destruction while( !IsEmpty() ) delete RemoveHead(); }; BOOL CFileList::RescheduleBrokenDownloads(int iMaxAttempts) { POSITION pos = GetHeadPosition(); BOOL bFound = FALSE; while( pos!=NULL ) { CDownloadFile *pInetFile = GetNext(pos); if( pInetFile->m_State == FILESTATE_BROKEN && pInetFile->m_iDownloadAttempts < iMaxAttempts ) { pInetFile->m_State = FILESTATE_WAITING; bFound = TRUE; }; }; return bFound; }; CDownloadFile *CFileList::GetNextDownload() { POSITION pos = GetHeadPosition(); while( pos!=NULL ) { CDownloadFile *pInetFile = GetNext(pos); if( pInetFile->m_State == FILESTATE_WAITING ) return pInetFile; }; return NULL; }; BOOL CFileList::ItemExists(LPCTSTR szUrl) { POSITION pos = GetHeadPosition(); while( pos!=NULL ) { CDownloadFile *pInetFile = GetNext(pos); if( 
_tcscmp(pInetFile->m_sURL, szUrl)==0 ) return TRUE; }; return FALSE; };