-- Search news pages
-- Public Domain - written by Rob Craig, February 2008
--
-- This demo reads numerous Web pages in parallel
-- and reports on the number of occurrences of a word or phrase.
-- Each page is handled by a separate Euphoria task running
-- in parallel with several other tasks. Pages that contain
-- matches are displayed using your default web browser.
-- The search words are shown in red with italics in a very large font.
--
-- This demo uses Euphoria's new multitasking feature.
-- It creates multiple wget background processes, each retrieving one Web page.
-- You can get a version of wget for Windows from:
--
--     http://www.gnu.org/software/wget/wget.html
--
-- A Euphoria task is assigned to each instance of wget, searching the
-- Web page text as it arrives. In this way, when a task is blocked due
-- to a delayed response from a particular server, the program can easily
-- switch to another task that is not blocked. The program quits after a
-- period of 10-15 seconds with no progress made on any page.
--
-- News Sources - Add your favorite sites here...
sequence URLs
-- good to have "/" at end of top level domain
URLs = {
    "www.cbc.ca/news/",
    "www.juancole.com/",
    "www.abc.net.au/",
    "abcnews.go.com/",
    "english.aljazeera.net/HomePage",
    "news.bbc.co.uk/",
    "www.cbsnews.com/",
    "cnn.com/",
    "www.democracynow.org/index.pl",
    "www.foxnews.com/",
    "www.guardian.co.uk/",
    "www.msnbc.msn.com/",
    "www.reuters.com/",
    "www.whatreallyhappened.com/",
    "news.yahoo.com/"
}

include wildcard.e
include graphics.e
include dll.e
include machine.e

constant SW_SHOWNORMAL = 1

-- We use ShellExecute to open the HTML files using the default browser
-- NOTE(review): this aborts when shell32.dll cannot be opened, which happens
-- before the platform() = LINUX branch further down is reached -- confirm
-- whether Linux is meant to be supported by this version of the demo.
atom shell32
shell32 = open_dll("shell32.dll")
if shell32 = NULL then
    puts(1, "Couldn't find shell32.dll\n")
    abort(1)
end if

integer ShellExecute
ShellExecute = define_c_proc(shell32, "ShellExecuteA",
                             {C_LONG, C_LONG, C_LONG, C_LONG, C_LONG, C_LONG})
if ShellExecute = -1 then
    puts(1, "Couldn't link to ShellExecuteA\n")
    abort(1)
end if

bk_color(0)

-- The search phrase is taken from the command line when one is supplied,
-- otherwise the user is prompted for it.
sequence cl
object search_phrase

cl = command_line()
if length(cl) >= 3 then
    search_phrase = cl[3]
else
    puts(1, "Enter search word or phrase:\n")
    search_phrase = gets(0)
    if atom(search_phrase) or length(search_phrase) < 2 then
        abort(1)
    elsif search_phrase[$] = '\n' then
        search_phrase = search_phrase[1..$-1]
    end if
end if

-- match() and match_from() require a non-empty first argument, so an empty
-- phrase (e.g. "" given on the command line) must be rejected up front.
if length(search_phrase) = 0 then
    puts(1, "Nothing to search for\n")
    abort(1)
end if

sequence null_device, del_cmd

if platform() = LINUX then
    URLs = URLs[1..9] -- less room on screen
    null_device = "/dev/null"
    del_cmd = "rm"
else
    null_device = "NUL"
    del_cmd = "del"
end if

-- HTML wrapped around every match so that it is shown in red italics
-- in a very large font.
constant PREFIX  = "<font color=\"red\" size=\"+4\"><i>",
         POSTFIX = "</i></font>"

function set_base_href(sequence fname, sequence url, sequence search_string)
-- Copy the downloaded page fname to "mod" & fname, inserting a base href
-- for url at the top and wrapping each occurrence of search_string in
-- PREFIX/POSTFIX.
--   fname:         name of the downloaded HTML file
--   url:           the page's address, without the leading "http://"
--   search_string: the phrase to highlight; it is compared against the
--                  upper-cased line, so it must already be upper case
-- Returns the name of the new file, or "" if either file can't be opened.
    sequence newname, uline, marked
    integer new, old, m, start
    object line

    newname = "mod" & fname
    new = open(newname, "w")
    old = open(fname, "r")
    if new = -1 or old = -1 then
        -- don't leak whichever of the two files did open
        if new != -1 then
            close(new)
        end if
        if old != -1 then
            close(old)
        end if
        puts(2, "couldn't open file!\n")
        return ""
    end if

    -- some pages need this (relative links would otherwise be resolved
    -- against the local copy):
    puts(new, "<base href=\"http://" & url & "\">\n")

    -- copy file, highlighting every match
    while 1 do
        line = gets(old)
        if atom(line) then
            exit
        end if
        uline = upper(line)  -- computed once per line
        marked = ""
        start = 1
        while 1 do
            m = match_from(search_string, uline, start)
            if m = 0 then
                exit
            end if
            -- text before the match, then the highlighted phrase
            marked &= line[start..m-1] & PREFIX & search_string & POSTFIX
            start = m + length(search_string)
        end while
        puts(new, marked & line[start..$])
    end while

    close(old)
    close(new)
    return newname
end function

procedure display_page(sequence fname, sequence url, sequence ustring)
-- Build the highlighted copy of a downloaded page and open it with the
-- default browser. Does nothing if the copy can't be created.
    sequence modtemp
    atom html_file, open_str

    -- display this page, but first insert base href
    modtemp = set_base_href(fname, url, ustring)
    if length(modtemp) = 0 then
        return  -- set_base_href has already reported the problem
    end if

    html_file = allocate_string(modtemp)
    open_str = allocate_string("open")
    c_proc(ShellExecute, {0, open_str, html_file, 0, 0, SW_SHOWNORMAL})
    -- the C strings are only needed for the duration of the call
    free(open_str)
    free(html_file)
end procedure

-- progress: set to 1 by any task that has read a line since the main loop
--           last cleared it
-- quit:     set to 1 by the main loop to tell all tasks to finish up
integer progress, quit

procedure task_search_url(sequence url, sequence string)
-- download a Web page and search it for a string
-- Runs as a task: status is reported on screen row task_self()+1, and the
-- page is opened in the browser on quit if at least one line matched.
    integer f, hits
    integer line_count
    object line
    sequence mytemp, ustring

    text_color(YELLOW)
    position(task_self()+1, 1)
    printf(1, "task %2.0f: %-35s waiting for wget...", {task_self(), url})

    ustring = upper(string)
    hits = 0

    -- run a copy of wget as a background process
    mytemp = sprintf("newstemp%.0f.html", task_self())
    system(sprintf("wget -q -b -O %s %s > %s",
                   {mytemp, "http://" & url, null_device}), 2)

    f = -1
    while f = -1 do
        -- wait until file exists
        if quit then
            return
        end if
        task_schedule(task_self(), {1.0, 2.0})
        task_yield()
        f = open(mytemp, "rb")
    end while

    text_color(BRIGHT_WHITE)
    position(task_self()+1, 1)
    printf(1, "task %2.0f: %-35s waiting for data...", {task_self(), url})
    text_color(WHITE)

    line_count = 0
    -- this loop is only left through the return below, once quit is set
    while 1 do
        line = gets(f)
        if atom(line) then
            -- could be actual end-of-file, or maybe there's more coming
            task_schedule(task_self(), {1.0, 1.5})
            while 1 do
                line = gets(f)
                if sequence(line) then
                    exit -- more data came in
                end if
                if quit then
                    close(f)
                    if hits then
                        display_page(mytemp, url, ustring)
                    end if
                    return -- we've been told to quit
                end if
                task_yield()
            end while
        end if

        if match(ustring, upper(line)) then
            hits += 1
        end if
        line_count += 1

        if hits then
            text_color(BRIGHT_GREEN)
        else
            text_color(BRIGHT_BLUE)
        end if
        position(task_self()+1, 1)
        printf(1, "task %2.0f: %-35s matched %d lines out of %d ",
               {task_self(), url, hits, line_count})
        text_color(WHITE)

        progress = 1

        -- this yield is not necessary, but it
        -- lets you see the parallelism better
        task_schedule(task_self(), 1)
        task_yield()
    end while
end procedure

-- create one task per news source; none of them runs until the main
-- program first calls task_yield() below
integer t

for i = 1 to length(URLs) do
    t = task_create(routine_id("task_search_url"), {URLs[i], search_phrase})
    task_schedule(t, 1)
end for

-- remove any pages left over from a previous run
system(del_cmd & " newstemp*.html > " & null_device, 2)

clear_screen()
puts(1, "Looking for \"" & search_phrase & "\"\n")

atom time_out
time_out = time() + 45

task_schedule(0, {2.5, 3.0}) -- check the time every 2.5 to 3.0 seconds

quit = 0
while 1 do
    progress = 0
    task_yield()
    if progress then
        -- quit 10 seconds after no more lines are read
        -- from any file by any task
        time_out = time() + 10
    else
        position(length(URLs)+3, 1)
        printf(1, "time remaining: %d seconds ", time_out - time())
        if time() > time_out then
            exit
        end if
    end if
end while

quit = 1 -- signal all tasks to report any final results and terminate

-- wait until this main task is the only one left
while length(task_list()) > 1 do
    task_yield()
end while

position(length(URLs)+4, 1)
puts(1, "Press Enter to quit ...\n")
if getc(0) then
end if

-- clean up the temporary files
system(del_cmd & " newstemp*.html > " & null_device, 2)
system(del_cmd & " modnewstemp*.html > " & null_device, 2)
system(del_cmd & " wget-log.* > " & null_device, 2)