//===========================================================
//== (c)2007 Foosun Inc. by dotNETCMS 1.0 ==
//== Forum:bbs.foosun.net ==
//== website:www.foosun.net ==
//== Address:NO.109 HuiMin ST.,Chengdu ,China ==
//== TEL:86-28-85098980/66026180 ==
//== qq:655071,MSN:ikoolls@gmail.com ==
//== Code By JiangDong ==
//===========================================================
using System;
using System.Collections.Generic;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using Foosun.Control;
using Foosun.Model;

namespace Foosun.CMS.Collect
19
...{
20
/// <summary>
/// Collecting (scraping) class.
/// </summary>
23
public class Collect
24
...{
25
// Data-access object; every persistence call in this class is delegated to it.
private readonly Foosun.DALFactory.ICollect dal;
// Failure message recorded by ValidateUrl and used as the exception text in SiteAdd/SiteUpdate.
private string ErrorMsg = "";
/// <summary>
/// Constructor: obtains the data-access object from the DAL factory.
/// </summary>
public Collect()
{
    dal = Foosun.DALFactory.DataAccess.CreateCollect();
}
34
#region 采集入库
/// <summary>
/// Whether remote pictures are saved locally.
/// </summary>
private bool bSaveRemotePic = false;
// Local directory handed to RemoteResource as the save location (set by Collecting).
private string PicSavePath = "";
// URL prefix handed to RemoteResource that corresponds to PicSavePath (set by Collecting).
private string PicSaveUrl = "";
41
/// <summary>
/// Starts collecting: loads the site's rules, gathers article links from the list page
/// (plus its pagination when more links are needed) and collects the articles one by one,
/// reporting progress through HProgressBar.
/// </summary>
/// <param name="folderid">ID passed to GetSite to load the collecting rules.</param>
/// <param name="num">Maximum number of articles to collect.</param>
/// <param name="bnorepeat">Forwarded to CollectPage; when true, titles that already exist are reported as duplicates.</param>
public void Collecting(int folderid, int num, bool bnorepeat)
{
    HProgressBar.Start("正在读取列表数据");
    DataTable tb = GetSite(folderid);

    // ---- Check that the site record and its mandatory rules are present ----
    if (tb == null || tb.Rows.Count < 1)
    {
        HProgressBar.Roll("没有找到该站点的相关记录!", 0);
        return;
    }
    DataRow r = tb.Rows[0];
    if (r.IsNull("LinkSetting") || r.IsNull("PageTitleSetting") || r.IsNull("PagebodySetting"))
    {
        HProgressBar.Roll("相关的参数没有设置,无法取得新闻列表!", 0);
        return;
    }

    // Reset per-run state: without this, a run for a site that saves remote pictures
    // would leave the flag set for every later run on the same instance.
    bSaveRemotePic = false;
    if (bool.Parse(r["SaveRemotePic"].ToString()))
    {
        // ---- Remote pictures: work out where they are stored and how they are addressed ----
        string rtpath = Foosun.Config.UIConfig.dirFile;
        if (rtpath == null || rtpath.Trim().Equals(""))
        {
            HProgressBar.Roll("没有找到管理员附件目录!", 0);
            return;
        }
        // One sub-directory per day.
        string dtpath = DateTime.Now.ToString("yyyyMMdd");
        PicSavePath = Foosun.Common.ServerInfo.GetRootPath().TrimEnd('\\') + @"\" + rtpath + @"\RemoteFiles\" + dtpath;
        if (!Directory.Exists(PicSavePath))
        {
            Directory.CreateDirectory(PicSavePath);
        }
        PicSaveUrl = Foosun.Publish.CommonData.getUrl() + "/" + rtpath + "/RemoteFiles/" + dtpath;
        bSaveRemotePic = true;
    }

    HProgressBar.Roll("正在获取新闻列表页", 0);

    string sListUrl = r["objURL"].ToString();
    string sEncode = r["Encode"].ToString();
    bool bReverse = bool.Parse(r["IsReverse"].ToString());
    // Default list rule: the whole <body>, unless the site defines its own.
    string listset = @"<body[^>]*>(?<list>[\s\S]+?)</body>";
    if (!r.IsNull("ListSetting"))
    {
        listset = r["ListSetting"].ToString();
    }
    PageList PL = new PageList(sListUrl, sEncode);
    PL.RuleOfList = listset;
    PL.RuleOfLink = r["LinkSetting"].ToString();

    string[] NewsUrl = GetNewsList(PL);
    if (NewsUrl == null)
    {
        // Treat "no result" as an empty list instead of failing on .Length below.
        NewsUrl = new string[0];
    }
    int len = NewsUrl.Length;

    // The first list page did not yield enough links: follow the configured pagination.
    if (len < num)
    {
        int pagetype = int.Parse(r["OtherType"].ToString());
        string[] otherurl = null;
        switch (pagetype)
        {
            case 1: // recursive
                otherurl = PL.Pagination(r["OtherPageSetting"].ToString(), num - len);
                break;
            case 2: // other pages
                otherurl = PL.SinglePagination(r["OtherPageSetting"].ToString(), num - len);
                break;
            case 3: // index pages
                otherurl = PL.IndexPagination(r["OtherPageSetting"].ToString(), int.Parse(r["StartPageNum"].ToString()), int.Parse(r["EndPageNum"].ToString()), num - len);
                break;
            default: // 0 or unknown: no pagination
                break;
        }
        if (otherurl != null && otherurl.Length > 0)
        {
            Array.Resize(ref NewsUrl, len + otherurl.Length);
            otherurl.CopyTo(NewsUrl, len);
        }
    }
    if (NewsUrl.Length < 1)
    {
        HProgressBar.Roll("从列表内容中没有找到任何新闻的相关链接!", 0);
        return;
    }
    if (bReverse)
    {
        Array.Reverse(NewsUrl);
    }

    HProgressBar.Roll("开始采集新闻", 0);
    int nSucceed = 0, nFailed = 0, nRepeat = 0;
    // Number of articles that will actually be processed. Percentages are computed
    // against this value so that progress reaches 100% even when fewer links than
    // requested were found.
    int total = Math.Min(num, NewsUrl.Length);
    for (int i = 0; i < total; i++)
    {
        try
        {
            // 0 = stored, -1 = duplicate title, 1 = failed.
            int flag = CollectPage(NewsUrl[i], r, bnorepeat);
            if (flag != 1)
            {
                // Duplicates are counted as successes and additionally reported separately.
                nSucceed++;
                if (flag == -1)
                {
                    nRepeat++;
                }
            }
            else
            {
                nFailed++;
            }
        }
        catch (Exception)
        {
            // Best effort: one broken article must not abort the whole run.
            nFailed++;
        }
        string prompt = "正在采集新闻,终止<a href=\"Collect_List.aspx\">返回</a>.成功:" + nSucceed * 100 / total + "% ";
        if (nRepeat > 0)
        {
            prompt += "(其中重复:" + nRepeat * 100 / total + "%) ";
        }
        prompt += "失败:" + nFailed * 100 / total + "%";
        HProgressBar.Roll(prompt, (i + 1) * 100 / total);
    }
}
155
/// <summary>
/// Collects a single news page: fetches it, extracts title, content, author, source and
/// date according to the site rules in <paramref name="r"/>, filters the content and
/// stores the result through NewsAdd.
/// </summary>
/// <param name="Url">Address of the news page.</param>
/// <param name="r">Site row holding the collecting rules.</param>
/// <param name="norepeat">When true, a page whose title already exists is not stored.</param>
/// <returns>0 on success, -1 when the title is a duplicate, 1 on failure.</returns>
private int CollectPage(string Url, DataRow r, bool norepeat)
{
    if (Url == null || Url.Trim().Equals(""))
    {
        return 1;
    }
    PageNews pn = new PageNews(Url, r["Encode"].ToString());
    if (!pn.Fetch())
    {
        return 1;
    }
    pn.RuleOfTitle = r["PageTitleSetting"].ToString();
    pn.RuleOfContent = r["PagebodySetting"].ToString();
    pn.FigureTitle();
    // Check for a duplicate before doing the remaining extraction work.
    if (norepeat && dal.TitleExist(pn.Title))
    {
        return -1;
    }
    pn.FigureContent();

    // For author, source and date a hand-set value (when present) takes precedence
    // over the extraction rule; the boolean tells PageNews which of the two it is given.
    if (r.IsNull("HandSetAuthor"))
    {
        pn.FigureAuthor(r["AuthorSetting"].ToString(), false);
    }
    else
    {
        pn.FigureAuthor(r["HandSetAuthor"].ToString(), true);
    }
    if (r.IsNull("HandSetSource"))
    {
        pn.FigureSource(r["SourceSetting"].ToString(), false);
    }
    else
    {
        pn.FigureSource(r["HandSetSource"].ToString(), true);
    }
    if (r.IsNull("HandSetAddDate"))
    {
        pn.FigureAddTime(r["AddDateSetting"].ToString(), false);
    }
    else
    {
        pn.FigureAddTime(r["HandSetAddDate"].ToString(), true);
    }

    // Append the content of the article's further pages, if pagination is configured.
    int pgtp = int.Parse(r["OtherNewsType"].ToString());
    if (pgtp == 1)
    {
        pn.Content += pn.GetOtherPagination(r["OtherNewsPageSetting"].ToString());
    }
    else if (pgtp == 2)
    {
        pn.Content += pn.GetIndexPagination(r["OtherNewsPageSetting"].ToString());
    }

    pn.Filter(bool.Parse(r["TextTF"].ToString()),
        bool.Parse(r["IsStyle"].ToString()), bool.Parse(r["IsDIV"].ToString()), bool.Parse(r["IsA"].ToString()),
        bool.Parse(r["IsClass"].ToString()), bool.Parse(r["IsFont"].ToString()), bool.Parse(r["IsSpan"].ToString()),
        bool.Parse(r["IsObject"].ToString()), bool.Parse(r["IsIFrame"].ToString()), bool.Parse(r["IsScript"].ToString()));
    if (!r.IsNull("OldContent") && !r.IsNull("ReContent") && !r.IsNull("IgnoreCase"))
    {
        pn.Replace(r["OldContent"].ToString(), r["ReContent"].ToString(), bool.Parse(r["IgnoreCase"].ToString()));
    }

    // A page without content or without a title counts as a failure. The title is
    // null-checked as well so a failed title extraction cannot raise an exception here.
    bool hasContent = pn.Content != null && !pn.Content.Trim().Equals("");
    bool hasTitle = pn.Title != null && !pn.Title.Trim().Equals("");
    if (!hasContent || !hasTitle)
    {
        return 1;
    }

    Foosun.Model.CollectNewsInfo ninf = new Foosun.Model.CollectNewsInfo();
    ninf.Author = pn.Author;
    ninf.Source = pn.Source;
    ninf.AddDate = pn.AddTime;
    ninf.Title = pn.Title;
    ninf.SiteID = int.Parse(r["ID"].ToString());
    ninf.Links = Url;
    ninf.ClassID = r["ClassID"].ToString();
    string Content = pn.Content;
    if (bSaveRemotePic)
    {
        // Fetch the remote resources and use the content returned by RemoteResource.
        RemoteResource rs = new RemoteResource(Content, PicSaveUrl, PicSavePath, Url, true);
        rs.FetchResource();
        Content = rs.Content;
    }
    ninf.Content = Content;
    try
    {
        NewsAdd(ninf);
        return 0;
    }
    catch (Exception)
    {
        // A storage error is reported to the caller as an ordinary failure.
        return 1;
    }
}
250
/// <summary>
/// Fetches the list page and extracts the news links from it.
/// </summary>
/// <param name="pagelist">List page with its list and link rules already set.</param>
/// <returns>The extracted news URLs; an empty array when the page could not be fetched or nothing was extracted.</returns>
private string[] GetNewsList(PageList pagelist)
{
    if (!pagelist.Fetch())
    {
        // Report the fetch error and stop: there is no page content to parse.
        HProgressBar.Roll(pagelist.LastError, 0);
        return new string[0];
    }
    pagelist.FigureList();
    pagelist.FigureNewsUrls();
    string[] urls = pagelist.NewsUrl;
    // Never hand null to the caller, which reads .Length immediately.
    return urls == null ? new string[0] : urls;
}
260
#endregion
261
262
/// <summary>
/// Gets one page of folder information and collecting sites.
/// </summary>
/// <param name="FolderID">Folder ID; a value below 1 means the root, otherwise only the sites in that folder are returned.</param>
/// <param name="PageIndex">Current page number.</param>
/// <param name="PageSize">Number of records per page.</param>
/// <param name="RecordCount">Receives the total number of records.</param>
/// <param name="PageCount">Receives the total number of pages.</param>
/// <returns>The data for the current page.</returns>
public DataTable GetFolderSitePage(int FolderID, int PageIndex, int PageSize, out int RecordCount, out int PageCount)
{
    return dal.GetFolderSitePage(FolderID, PageIndex, PageSize, out RecordCount, out PageCount);
}
275
/// <summary>
/// Copies a folder.
/// </summary>
/// <param name="id">ID of the folder to copy.</param>
public void FolderCopy(int id)
{
    dal.FolderCopy(id);
}
283
/// <summary>
/// Copies a collecting site.
/// </summary>
/// <param name="id">ID of the site to copy.</param>
public void SiteCopy(int id)
{
    dal.SiteCopy(id);
}
291
/// <summary>
/// Deletes a collecting folder.
/// </summary>
/// <param name="id">ID of the folder to delete.</param>
public void FolderDelete(int id)
{
    dal.FolderDelete(id);
}
299
/// <summary>
/// Deletes a collecting site.
/// </summary>
/// <param name="id">ID of the site to delete.</param>
public void SiteDelete(int id)
{
    dal.SiteDelete(id);
}
307
/// <summary>
/// Gets the information of one folder (used when editing a folder).
/// </summary>
/// <param name="id">ID of the folder to load.</param>
/// <returns>The data of the requested folder.</returns>
public DataTable GetFolder(int id)
{
    // The second argument is false here and true in the parameterless overload.
    return dal.GetFolder(id, false);
}
316
/// <summary>
/// Gets the information of all folders.
/// </summary>
/// <returns>The information of all folders.</returns>
public DataTable GetFolder()
{
    // ID 0 together with the "all" flag set to true.
    return dal.GetFolder(0, true);
}
324
/// <summary>
/// Gets the information of one collecting site (mainly used for editing and configuring a site).
/// </summary>
/// <param name="id">ID of the site.</param>
/// <returns>The site data.</returns>
public DataTable GetSite(int id)
{
    return dal.GetSite(id);
}
333
/// <summary>
/// Adds a new collecting site after checking its encoding and that its list URL can be reached.
/// </summary>
/// <param name="st">Information of the collecting site to add.</param>
/// <returns>The auto-generated number of the new site.</returns>
/// <exception cref="Exception">The site URL failed validation; the message is the one recorded by ValidateUrl.</exception>
public int SiteAdd(CollectSiteInfo st)
{
    // Called for validation only: GetEncoding throws when the encoding is not
    // supported, so the result itself is not needed.
    Encoding.GetEncoding(st.Encode);
    if (!this.ValidateUrl(st.objURL))
    {
        throw new Exception(ErrorMsg);
    }
    return dal.SiteAdd(st);
}
345
/// <summary>
/// Updates a collecting site for the given configuration step.
/// Step 1 validates the encoding and the URL; step 2 normalises the pagination
/// settings; any other step is passed through to the data layer unchanged.
/// </summary>
/// <param name="st">Site information to save.</param>
/// <param name="step">Configuration step being saved.</param>
/// <exception cref="Exception">Step 1 only: the site URL failed validation.</exception>
public void SiteUpdate(CollectSiteInfo st, int step)
{
    if (step == 1)
    {
        // Called for validation only: throws when the encoding is not supported.
        Encoding.GetEncoding(st.Encode);
        if (!this.ValidateUrl(st.objURL))
        {
            throw new Exception(ErrorMsg);
        }
    }
    else if (step == 2)
    {
        // Clear the pagination fields that are cleared for the selected OtherType.
        switch (st.OtherType)
        {
            case 0:
                st.OtherPageSetting = "";
                st.StartPageNum = -1;
                st.EndPageNum = -1;
                break;
            case 1:
            case 2:
                st.StartPageNum = -1;
                st.EndPageNum = -1;
                break;
        }
    }
    dal.SiteUpdate(st, step);
}
371
/// <summary>
/// Adds a collecting folder through the data layer.
/// </summary>
/// <param name="Name">Folder name.</param>
/// <param name="Description">Folder description.</param>
/// <returns>The value returned by the data layer (presumably the new folder's ID; confirm against the DAL).</returns>
public int FolderAdd(string Name, string Description)
{
    return dal.FolderAdd(Name, Description);
}
375
/// <summary>
/// Updates the name and description of a collecting folder through the data layer.
/// </summary>
/// <param name="id">ID of the folder to update.</param>
/// <param name="Name">New folder name.</param>
/// <param name="Description">New folder description.</param>
public void FolderUpdate(int id, string Name, string Description)
{
    dal.FolderUpdate(id, Name, Description);
}
379
/// <summary>
/// Checks that <paramref name="sUrl"/> can be requested over HTTP.
/// On failure the reason is stored in ErrorMsg.
/// </summary>
/// <param name="sUrl">URL to probe.</param>
/// <returns>true when a response was received without error; otherwise false.</returns>
private bool ValidateUrl(string sUrl)
{
    // Start clean so that a message left by an earlier call is never reported.
    ErrorMsg = "";
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(sUrl);
        req.KeepAlive = false;
        HttpWebResponse rsp = (HttpWebResponse)req.GetResponse();
        rsp.Close();
        return true;
    }
    catch (WebException e)
    {
        HttpWebResponse response = e.Response as HttpWebResponse;
        if (response == null)
        {
            ErrorMsg = "请检查采集对象页地址,不能从服务器取得任何信息!";
        }
        else
        {
            try
            {
                // For 401 prefer the server's authentication challenge; fall back to the
                // exception message when the header is absent (GetResponseHeader then
                // returns an empty string) so ErrorMsg is never left empty.
                string challenge = null;
                if (response.StatusCode == HttpStatusCode.Unauthorized)
                {
                    challenge = response.GetResponseHeader("WWW-Authenticate");
                }
                ErrorMsg = string.IsNullOrEmpty(challenge) ? e.Message : challenge;
            }
            finally
            {
                // Release the connection held by the error response.
                response.Close();
            }
        }
    }
    catch (Exception e)
    {
        ErrorMsg = e.Message;
    }
    return false;
}
415
/// <summary>
/// Gets one page of rules from the data layer.
/// </summary>
/// <param name="PageIndex">Current page number.</param>
/// <param name="PageSize">Number of records per page.</param>
/// <param name="RecordCount">Receives the total number of records.</param>
/// <param name="PageCount">Receives the total number of pages.</param>
/// <returns>The data for the requested page.</returns>
public DataTable GetRulePage(int PageIndex, int PageSize, out int RecordCount, out int PageCount)
{
    return dal.GetRulePage(PageIndex, PageSize, out RecordCount, out PageCount);
}
419
/// <summary>
/// Deletes a rule through the data layer.
/// </summary>
/// <param name="id">ID of the rule to delete.</param>
public void RuleDelete(int id)
{
    dal.RuleDelete(id);
}
423
/// <summary>
/// Adds a rule through the data layer.
/// </summary>
/// <param name="Name">Rule name.</param>
/// <param name="OldStr">Text to be replaced.</param>
/// <param name="NewStr">Replacement text.</param>
/// <param name="AppSites">Site IDs (presumably the sites the rule applies to; confirm against the DAL).</param>
/// <param name="IgnoreCase">Whether matching ignores case.</param>
/// <returns>The value returned by the data layer (presumably the new rule's ID; confirm against the DAL).</returns>
public int RuleAdd(string Name, string OldStr, string NewStr, int[] AppSites, bool IgnoreCase)
{
    return dal.RuleAdd(Name, OldStr, NewStr, AppSites, IgnoreCase);
}
427
/// <summary>
/// Updates a rule through the data layer.
/// </summary>
/// <param name="RuleID">ID of the rule to update.</param>
/// <param name="Name">Rule name.</param>
/// <param name="OldStr">Text to be replaced.</param>
/// <param name="NewStr">Replacement text.</param>
/// <param name="AppSites">Site IDs (presumably the sites the rule applies to; confirm against the DAL).</param>
/// <param name="IgnoreCase">Whether matching ignores case.</param>
public void RuleUpdate(int RuleID, string Name, string OldStr, string NewStr, int[] AppSites, bool IgnoreCase)
{
    dal.RuleUpdate(RuleID, Name, OldStr, NewStr, AppSites, IgnoreCase);
}
431
/// <summary>
/// Gets a rule from the data layer.
/// </summary>
/// <param name="id">ID of the rule.</param>
/// <returns>The rule data.</returns>
public DataTable GetRule(int id)
{
    return dal.GetRule(id);
}
435
/// <summary>
/// Gets the site list from the data layer.
/// </summary>
/// <returns>The table returned by the data layer's SiteList.</returns>
public DataTable SiteList()
{
    return dal.SiteList();
}
439
/// <summary>
/// Stores a collected news item through the data layer.
/// </summary>
/// <param name="newsinfo">The collected news item.</param>
public void NewsAdd(CollectNewsInfo newsinfo)
{
    dal.NewsAdd(newsinfo);
}
443
/// <summary>
/// Gets one page of collected news from the data layer.
/// </summary>
/// <param name="PageIndex">Current page number.</param>
/// <param name="PageSize">Number of records per page.</param>
/// <param name="RecordCount">Receives the total number of records.</param>
/// <param name="PageCount">Receives the total number of pages.</param>
/// <returns>The data for the requested page.</returns>
public DataTable GetNewsPage(int PageIndex, int PageSize, out int RecordCount, out int PageCount)
{
    return dal.GetNewsPage(PageIndex, PageSize, out RecordCount, out PageCount);
}
447
/// <summary>
/// Deletes collected news through the data layer.
/// </summary>
/// <param name="id">Identifier string passed unchanged to the data layer (its format is defined by the DAL; not visible here).</param>
public void NewsDelete(string id)
{
    dal.NewsDelete(id);
}
451
/// <summary>
/// Gets one collected news item from the data layer.
/// </summary>
/// <param name="id">ID of the news item.</param>
/// <returns>The news item returned by the data layer.</returns>
public CollectNewsInfo GetNews(int id)
{
    return dal.GetNews(id);
}
455
/// <summary>
/// Updates a collected news item through the data layer.
/// </summary>
/// <param name="id">ID of the news item to update.</param>
/// <param name="info">New content of the news item.</param>
public void NewsUpdate(int id, CollectNewsInfo info)
{
    dal.NewsUpdate(id, info);
}
459
460
#region 新闻入库
/// <summary>
/// Moves collected news into storage and reports the outcome through HTextProgressBar.
/// </summary>
/// <param name="id">"0" stores every collected news item that is not stored yet; otherwise the IDs of the items to store, separated by commas.</param>
public void StorageNews(string id)
{
    string s = "请点击<a href=\"Collect_News.aspx\">这里返回</a>";
    try
    {
        HTextProgressBar.Start("正在统计数据");
        int[] nid = null;

        bool bUnAll = (id == "0");
        if (!bUnAll)
        {
            // Parse the comma-separated ID list. Blank entries (for example from a
            // trailing comma) are skipped rather than aborting the whole operation;
            // a non-numeric entry still raises and is reported by the catch below.
            List<int> ids = new List<int>();
            if (id != null)
            {
                foreach (string part in id.Split(','))
                {
                    string token = part.Trim();
                    if (token.Length > 0)
                    {
                        ids.Add(int.Parse(token));
                    }
                }
            }
            if (ids.Count < 1)
            {
                HTextProgressBar.EndProgress("没有选择要入库的采集新闻!" + s);
                return;
            }
            nid = ids.ToArray();
        }

        int ns = 0;
        int nf = 0;
        HTextProgressBar.ShowText("开始入库采集新闻,请稍候。要终止," + s);
        // ns and nf receive the numbers of stored and failed items.
        dal.StoreNews(bUnAll, nid, out ns, out nf);
        HTextProgressBar.EndProgress("采集新闻入库已完成。共成功:" + ns + "条,失败:" + nf + "条新闻。" + s);
    }
    catch (Exception ex)
    {
        HTextProgressBar.EndProgress("采集新闻入库异常终止。异常信息:" + ex.Message + "<br/>" + s);
    }
}
#endregion 新闻入库
507
}
508
}
509