-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathProgram.cs
292 lines (257 loc) · 13 KB
/
Program.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using HtmlAgilityPack;
namespace ImageDownloader
{
class Program
{
static List<char> alph = new List<char>();
static char[] startUrl = "aaaaa".ToCharArray();
static void Main(string[] args)
{
for (char c = 'a'; c <= 'z'; ++c)
{
alph.Add(c);
}
alph.AddRange(Enumerable.Range(1, 9).Select(x => x.ToString()[0]).ToArray());
HtmlReaderManager hrm = new HtmlReaderManager();
LimitedConcurrencyLevelTaskScheduler lcts =
new LimitedConcurrencyLevelTaskScheduler(200);
TaskFactory factory = new TaskFactory(lcts);
var tasks = new List<Task>();
int counter = 1;
foreach (string url in GetNextUrl())
{
Options opt = new Options();
opt.Url = url;
opt.counter = counter;
tasks.Add(factory.StartNew((opts) =>
{
bool isDownloaded = false;
int tryCount = 0;
do
{
Options options = (Options) opts;
string targetUrl = "http://prntscr.com/1" + options.Url;
try
{
hrm.Get(targetUrl);
string html = hrm.Html;
HtmlAgilityPack.HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
string imgUrl = doc.DocumentNode.SelectSingleNode("//meta[@name='twitter:image:src']").Attributes["content"].Value;
if (!string.IsNullOrEmpty(imgUrl))
{
ImageDownloader.DownloadRemoteImageFile(imgUrl, "images/" + Guid.NewGuid() + ".jpeg");
Console.WriteLine(options.counter + ". Скачиваем " + imgUrl);
}
else
Console.WriteLine(options.counter + ". " + imgUrl + " пуст");
break;
}
catch (Exception ex)
{
tryCount++;
if (ex.ToString().StartsWith("System.Net.WebException: The remote server returned an error: (503) Server Unavailable.") && tryCount<20)
{
Random s = new Random();
int randomVal = s.Next(1000, 10000);
Thread.Sleep(randomVal);
Console.WriteLine(options.counter + ". ждём " + targetUrl +" ("+tryCount+" раз)");
}
else
{
Console.WriteLine(options.counter + ". ошибка " + targetUrl);
isDownloaded = true;
}
}
} while (!isDownloaded);
}, opt, TaskCreationOptions.LongRunning));
if (counter%200 == 0)
Task.WaitAll(tasks.ToArray());
tasks.Clear();
counter++;
}
Console.ReadLine();
}
private static IEnumerable GetNextUrl(int checkCharIndex=0)
{
foreach (char c in alph)
{
startUrl[checkCharIndex] = c;
if((startUrl.Length-1)!=checkCharIndex)
{
foreach (var innC in GetNextUrl(checkCharIndex + 1))
{
yield return innC;
}
}
else
{
yield return new string(startUrl);
}
}
}
}
public struct Options
{
public string Url;
public int counter;
}
public class ImageDownloader
{
public static void DownloadRemoteImageFile(string uri, string fileName)
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
// Check that the remote file was found. The ContentType
// check is performed since a request for a non-existent
// image file might be redirected to a 404-page, which would
// yield the StatusCode "OK", even though the image was not
// found.
if ((response.StatusCode == HttpStatusCode.OK ||
response.StatusCode == HttpStatusCode.Moved ||
response.StatusCode == HttpStatusCode.Redirect) &&
response.ContentType.StartsWith("image", StringComparison.OrdinalIgnoreCase))
{
// if the remote file was found, download oit
using (Stream inputStream = response.GetResponseStream())
using (Stream outputStream = File.OpenWrite(fileName))
{
byte[] buffer = new byte[4096];
int bytesRead;
do
{
bytesRead = inputStream.Read(buffer, 0, buffer.Length);
outputStream.Write(buffer, 0, bytesRead);
} while (bytesRead != 0);
}
}
}
}
/// <summary>
/// Provides a task scheduler that ensures a maximum concurrency level while
/// running on top of the ThreadPool.
/// http://msdn.microsoft.com/en-us/library/ee789351.aspx
/// </summary>
public class LimitedConcurrencyLevelTaskScheduler : TaskScheduler
{
/// <summary>Whether the current thread is processing work items.</summary>
[ThreadStatic]
private static bool _currentThreadIsProcessingItems;
/// <summary>The list of tasks to be executed.</summary>
private readonly LinkedList<Task> _tasks = new LinkedList<Task>(); // protected by lock(_tasks)
/// <summary>The maximum concurrency level allowed by this scheduler.</summary>
private readonly int _maxDegreeOfParallelism;
/// <summary>Whether the scheduler is currently processing work items.</summary>
private int _delegatesQueuedOrRunning = 0; // protected by lock(_tasks)
/// <summary>
/// Initializes an instance of the LimitedConcurrencyLevelTaskScheduler class with the
/// specified degree of parallelism.
/// </summary>
/// <param name="maxDegreeOfParallelism">The maximum degree of parallelism provided by this scheduler.</param>
public LimitedConcurrencyLevelTaskScheduler(int maxDegreeOfParallelism)
{
if (maxDegreeOfParallelism < 1) throw new ArgumentOutOfRangeException("maxDegreeOfParallelism");
_maxDegreeOfParallelism = maxDegreeOfParallelism;
}
/// <summary>Queues a task to the scheduler.</summary>
/// <param name="task">The task to be queued.</param>
protected sealed override void QueueTask(Task task)
{
// Add the task to the list of tasks to be processed. If there aren't enough
// delegates currently queued or running to process tasks, schedule another.
lock (_tasks)
{
_tasks.AddLast(task);
if (_delegatesQueuedOrRunning < _maxDegreeOfParallelism)
{
++_delegatesQueuedOrRunning;
NotifyThreadPoolOfPendingWork();
}
}
}
/// <summary>
/// Informs the ThreadPool that there's work to be executed for this scheduler.
/// </summary>
private void NotifyThreadPoolOfPendingWork()
{
ThreadPool.UnsafeQueueUserWorkItem(_ =>
{
// Note that the current thread is now processing work items.
// This is necessary to enable inlining of tasks into this thread.
_currentThreadIsProcessingItems = true;
try
{
// Process all available items in the queue.
while (true)
{
Task item;
lock (_tasks)
{
// When there are no more items to be processed,
// note that we're done processing, and get out.
if (_tasks.Count == 0)
{
--_delegatesQueuedOrRunning;
break;
}
// Get the next item from the queue
item = _tasks.First.Value;
_tasks.RemoveFirst();
}
// Execute the task we pulled out of the queue
base.TryExecuteTask(item);
}
}
// We're done processing items on the current thread
finally { _currentThreadIsProcessingItems = false; }
}, null);
}
/// <summary>Attempts to execute the specified task on the current thread.</summary>
/// <param name="task">The task to be executed.</param>
/// <param name="taskWasPreviouslyQueued"></param>
/// <returns>Whether the task could be executed on the current thread.</returns>
protected sealed override bool TryExecuteTaskInline(Task task, bool taskWasPreviouslyQueued)
{
// If this thread isn't already processing a task, we don't support inlining
if (!_currentThreadIsProcessingItems) return false;
// If the task was previously queued, remove it from the queue
if (taskWasPreviouslyQueued) TryDequeue(task);
// Try to run the task.
return base.TryExecuteTask(task);
}
/// <summary>Attempts to remove a previously scheduled task from the scheduler.</summary>
/// <param name="task">The task to be removed.</param>
/// <returns>Whether the task could be found and removed.</returns>
protected sealed override bool TryDequeue(Task task)
{
lock (_tasks) return _tasks.Remove(task);
}
/// <summary>Gets the maximum concurrency level supported by this scheduler.</summary>
public sealed override int MaximumConcurrencyLevel { get { return _maxDegreeOfParallelism; } }
/// <summary>Gets an enumerable of the tasks currently scheduled on this scheduler.</summary>
/// <returns>An enumerable of the tasks currently scheduled.</returns>
protected sealed override IEnumerable<Task> GetScheduledTasks()
{
bool lockTaken = false;
try
{
Monitor.TryEnter(_tasks, ref lockTaken);
if (lockTaken) return _tasks.ToArray();
else throw new NotSupportedException();
}
finally
{
if (lockTaken) Monitor.Exit(_tasks);
}
}
}
}