NCrawler中使用Cookie登录

news/2024/5/18 15:45:56 标签: webclient, cookie

代码片段

using System.Net;

namespace Crawler
{
    /// <summary>
    /// A <see cref="WebClient"/> that attaches one cookie container to every HTTP request
    /// it creates and keeps the cookie collection of the most recent HTTP response.
    /// </summary>
    public class CookiesAwareWebClient : WebClient
    {
        private CookieContainer _outboundCookies = new CookieContainer();
        private CookieCollection _inboundCookies = new CookieCollection();

        /// <summary>Cookie container assigned to each outgoing HTTP request.</summary>
        public CookieContainer OutboundCookies
        {
            get { return _outboundCookies; }
        }

        /// <summary>Cookies taken from the most recent HTTP response (empty until one arrives).</summary>
        public CookieCollection InboundCookies
        {
            get { return _inboundCookies; }
        }

        /// <summary>When true, HTTP requests are created with automatic redirects disabled.</summary>
        public bool IgnoreRedirects { get; set; }

        /// <summary>
        /// Creates the request through the base class and, for HTTP requests, applies the
        /// cookie container, redirect policy, user agent and form content type.
        /// </summary>
        /// <param name="address">Address the request is created for.</param>
        /// <returns>The request produced by the base class, configured when it is an HTTP request.</returns>
        protected override WebRequest GetWebRequest(System.Uri address)
        {
            WebRequest created = base.GetWebRequest(address);
            HttpWebRequest http = created as HttpWebRequest;
            if (http == null)
            {
                // Not an HTTP request: hand it back untouched.
                return created;
            }

            http.CookieContainer = _outboundCookies;
            http.AllowAutoRedirect = !IgnoreRedirects;
            http.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705;)";
            http.ContentType = "application/x-www-form-urlencoded";
            return created;
        }

        /// <summary>
        /// Obtains the response through the base class and, for HTTP responses, remembers
        /// its cookies so they can be read from <see cref="InboundCookies"/>.
        /// </summary>
        /// <param name="request">Request whose response is being fetched.</param>
        /// <returns>The response produced by the base class.</returns>
        protected override WebResponse GetWebResponse(WebRequest request)
        {
            WebResponse received = base.GetWebResponse(request);
            HttpWebResponse http = received as HttpWebResponse;
            if (http != null)
            {
                // Keep the previous collection when the response exposes none.
                CookieCollection latest = http.Cookies;
                if (latest != null)
                {
                    _inboundCookies = latest;
                }
            }

            return received;
        }

    }
}

 CustomDownloaderModule.cs Raw
using System.Net;
using NCrawler;
using Autofac;
using NCrawler.Interfaces;

namespace Crawler
{
    /// <summary>
    /// NCrawler module that registers <see cref="CustomWebDownloader"/> as the
    /// <see cref="IWebDownloader"/> implementation, handing it the supplied cookie container.
    /// </summary>
    public class CustomDownloaderModule : NCrawlerModule
    {
        private readonly CookieContainer _cookieContainer;

        /// <summary>Creates the module.</summary>
        /// <param name="cookieContainer">
        /// Cookie container passed to the <see cref="CustomWebDownloader"/> this module registers.
        /// </param>
        public CustomDownloaderModule(CookieContainer cookieContainer)
        {
            _cookieContainer = cookieContainer;
        }

        /// <summary>
        /// Runs the base module's registrations, then registers a single
        /// <see cref="CustomWebDownloader"/> instance as <see cref="IWebDownloader"/>.
        /// </summary>
        /// <param name="builder">Autofac container builder receiving the registrations.</param>
        protected override void Load(ContainerBuilder builder)
        {
            // Base registrations go first so that this registration is added after them.
            base.Load(builder);

            builder.Register(c => new CustomWebDownloader(_cookieContainer))
                   .As<IWebDownloader>()
                   .SingleInstance()
                   .ExternallyOwned();
        }

        /// <summary>
        /// Convenience entry point: sets up NCrawler with a <see cref="CustomDownloaderModule"/>
        /// built from <paramref name="cookieContainer"/>.
        /// </summary>
        /// <param name="cookieContainer">Cookie container the registered downloader will use.</param>
        public static void Setup(CookieContainer cookieContainer)
        {
            // Previously this called Setup(cookieContainer), i.e. itself, and recursed until
            // the stack overflowed. Delegate to the module-array overload on the base class
            // instead; the explicit array keeps overload resolution away from this method.
            NCrawlerModule.Setup(new Autofac.Module[] { new CustomDownloaderModule(cookieContainer) });
        }
    }
}
 CustomWebDownloader.cs Raw

using System.Net;
using NCrawler.Services;

namespace Crawler
{
    /// <summary>
    /// A <see cref="WebDownloaderV2"/> that assigns a caller-supplied cookie container
    /// to every request it prepares.
    /// </summary>
    public class CustomWebDownloader : WebDownloaderV2
    {
        private readonly CookieContainer _cookies;

        /// <summary>Creates the downloader.</summary>
        /// <param name="cookieContainer">Cookie container assigned to each prepared request.</param>
        public CustomWebDownloader(CookieContainer cookieContainer)
        {
            _cookies = cookieContainer;
        }

        /// <summary>
        /// Applies the base class defaults, then sets the request's cookie container
        /// to the one given at construction.
        /// </summary>
        /// <param name="request">Request being prepared.</param>
        protected override void SetDefaultRequestProperties(HttpWebRequest request)
        {
            // Base defaults first; the cookie container is assigned afterwards.
            base.SetDefaultRequestProperties(request);

            request.CookieContainer = _cookies;
        }
    }
}

 Program.cs Raw

using System;
using System.Collections.Specialized;
using System.Net;
using NCrawler;
using NCrawler.HtmlProcessor;
using NCrawler.Interfaces;
using NCrawler.Services;
using Module = Autofac.Module;

/// <summary>
/// Logs in to obtain the authorization cookies, sets up NCrawler with the cookie-aware
/// downloader module, then crawls the site.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
public static void Main(string[] args)
{
    var authorizedCookies = GetAuthorizationCookie(new Uri("http://mysecuresite.com/login.html"));

    // "Crawler" is also the namespace that holds CustomDownloaderModule, and a namespace
    // name takes precedence over a type brought in by a using directive. Both types are
    // therefore fully qualified: Crawler.* is the namespace, NCrawler.Crawler is the crawler.
    var modules = new Module[] { new Crawler.CustomDownloaderModule(authorizedCookies) };
    NCrawlerModule.Setup(modules);

    // The crawl start is passed as a Uri rather than a raw string.
    using (var crawler = new NCrawler.Crawler(new Uri("http://mysecuresite.com/"), new HtmlDocumentProcessor()))
    {
        crawler.Crawl();
    }
}

  /// <summary>
  /// Requests the login page, posts the login form to it, and returns the cookie
  /// container used for those requests.
  /// </summary>
  /// <param name="loginPage">Address of the login page; used for both the GET and the POST.</param>
  /// <returns>
  /// The client's outbound cookie container. A console message reports whether it holds any cookies.
  /// </returns>
  private static CookieContainer GetAuthorizationCookie(Uri loginPage)
  {
      CookieContainer cookies;

      // Put all required form post data here.
      var postData = new NameValueCollection
                        {
                            {"userid", "user1"},
                            {"pwd", "password"},
                        };

      using (var client = new CookiesAwareWebClient())
      {
          client.IgnoreRedirects = false;

          // Load the page via a GET request to initialize cookies.
          client.DownloadData(loginPage);

          // Add the cookies received so far to the outbound container.
          client.OutboundCookies.Add(client.InboundCookies);
          client.UploadValues(loginPage, "POST", postData);

          // Add the latest cookies (expected to include the authorization cookie).
          client.OutboundCookies.Add(client.InboundCookies);
          cookies = client.OutboundCookies;
      }

      // Console.Writeline does not exist (C# is case-sensitive); the method is WriteLine.
      if (cookies == null || cookies.Count == 0)
      {
          Console.WriteLine("Authorization Cookies are null or empty.");
      }
      else
      {
          Console.WriteLine("Authorization Cookies obtained.");
      }

      return cookies;
  }

Readme.txt Raw

How to add custom authentication using cookie-based authorization from a POST-based login page.

  1. Create cookies aware web client so that we can obtain the required cookies.
  2. Create CustomWebDownloader that inherits from WebDownloaderV2 and overrides the cookie behavior.
  3. Create custom NCrawlerModule that will implement the CustomWebDownloader.
  4. Get the required login cookies for the session.
  5. Register CustomDownloaderModule, passing in the authorized cookies. Note that the last item registered with Autofac will be the one used, so our CustomWebDownloader will now replace the default WebDownloaderV2.
  6. Crawl the site; it will now use the CustomWebDownloader for all links crawled.

http://www.niftyadmin.cn/n/1410856.html

相关文章

java web方面的面试问题,Spring MVC方面的面试问题,摘自java web轻量级开发面试教程...

本文摘自java web轻量级开发面试教程： https://baike.baidu.com/item/Java%20Web%E8%BD%BB%E9%87%8F%E7%BA%A7%E5%BC%80%E5%8F%91%E9%9D%A2%E8%AF%95%E6%95%99%E7%A8%8B/22038502?fraladdin 下面列出Spring Web方面的常见问题，除此之外，大家…

EF 批量更新/删除数据

在网上找了很久,得到的答案是”Entity Framework 中不能同时更新多条记录”,历经这么多版本,居然还没有这种基本功能,我真的很无语了.还要先查询出来,然后再对实体更新或删除,那效率可想而知了…… 在网上找了找解决方案,比如说这个 EF架构~性能高效的批量操作(Update篇&…

8月28日学习内容整理:文件内容补充,浮点型复数型,函数定义,调用,返回值,参数...

补充： 一、文件 1、尽量不要对同一个文件进行又读又写操作，这样会很乱 2、obj=open(.....) obj称为文件句柄，文件操作符 3、readlines() 也是读整个文件，输出的是列表类型 4、只读r模式，按光标（指针…

比ThreadPool对象更好用的线程控制对象

Task对象很多人知道了(使用Task代替ThreadPool和Thread, C#线程篇—Task（任务）和线程池不得不说的秘密（5）) 相对的还有TaskScheduler 这个调度器,可以自定义调度器,只要重写TaskScheduler 方法就可以了 微软原来一早就对他进行…

EF实体类指定部分属性不映射成数据库字段的方法

无营养贴 在想要不映射的字段上加上[NotMapped]标记 如: [NotMapped]public string Status { get; set; }

EF中扩展出Between操作符

using System; using System.Linq; using System.Linq.Expressions;namespace FuLu.Finance.Assistant.Common.Linq {public static class LinqExtension{/// <summary>/// 扩展Between 操作符/// 使用 var query db.People.Between(person > person.Age, 18, 21);//…

管理你的Visual Studio Toolbox

原文链接 The Most Complete Guide to Visual Studio Toolbox Control Integration Libor Tinka, Lead Developer, ComponentOwl.comContents 1. Introduction2. Prerequisites3. Creating a Sample Control4. Manual Toolbox Integration5. Toolbox Integration using TCI6. T…

linux下的开源移动图像监测程序--motion编译与配置【转】

本文转载自：http://www.cnblogs.com/qinyg/p/3355707.html 前几天在网上偶然看到一篇博客，是利用linux下的开源的motion搭建嵌入式视频动态监控系统，感觉很好很强大，于是就想自己编译移植一下试试。 所谓移动图像监测，…