Wikix
Wikix is a 'C' based program written by Jeffrey Vernon Merkey that will read any XML dump provided by the foundation, extract all image names from the XML dump which it may reference, then generate a series of BASH or Bourne Unix style scripts which can be invoked to download all images from Wikimedia Commons and Wikipedia.
The program relies on cURL, a command-line URL transfer tool, to download the referenced images. The program will also convert text-based UTF-8 escape sequences into actual UTF-8 strings for those dumps which may contain improperly formatted names for specific images. The program can be configured to generate 16 parallel scripts which will download all images from Wikipedia. The program includes Jeff Bezanson's UTF-8 library.
As of March 24, 2008, using a cable modem, the entire set of Wikipedia images can be downloaded in about 96 hours using this program (420 GB as of 3/24/08).
Compiling and Installation
First, download the source code from the Source Code section.
On Ubuntu
- Extract the contents of wikix.tar.gz. Suppose the source code is extracted into /home/you/wikix.
- Start your terminal program. e.g. Konsole (in KDE)
- You need to install some packages before you compile Wikix. Type in your terminal:-
sudo aptitude install libssl-dev build-essential curl
- Now go to the directory that contains the extracted source code, e.g. /home/you/wikix, by typing
cd /home/you/wikix
- Now type in your terminal.
sudo make
Now if the compilation and linking of Wikix completes without errors then you will have a brand new executable - wikix, in your /home/you/wikix (in this example) directory.
In case of any problems, please report it in the discussion page.
Options
# ./wikix -h
USAGE: wikix -htrciop < file.xml [ > script.out ]
    -h this help screen
    -t use xml dump to strip from tree
    -r wikipedia path
    -c commons path
    -i image path
    -o output path
    -p parallel (16 process) mode
Example
The program would typically be invoked in the directory that you wish to use to host the images. Wikix will construct a MediaWiki-style directory structure which can be quickly imported into a MediaWiki Wikipedia installation (e.g., via php rebuildImages.php --missing):
wikix -p < name_of_xml_file.xml > script.out &
That is, the xml file is fed via stdin.
The -p option tells wikix to create parallel scripts. If you omit the -p option, it will create one very large file. By default, the program is set up to mirror the English Wikipedia. You can override the default settings by substituting path information for commons and the target Wikipedia site through the command line options.
The program will create a series of scripts as:
image_sh image00 image01 image02 image03 image04 image05 image06 image07 image08 image09 image10 image11 image12 image13 image14 image15
To start the download, simply type
$./image_sh
In case you get the following error.
./image_sh: <line no.>: Syntax error: Bad fd number
Then please open image_sh and all the imagexx files, and change the topmost line from
#!/bin/sh
to
#!/bin/bash
then re-type
$./image_sh
Source Code
The full source code with build scripts can be downloaded from these web servers:
- http://wikix.ngen-cast.de/wikix.tar.gz (July 2013 check: it's broken)
- https://github.com/mattrude/wikix (June 2014: works)
Source Code Files
Makefile
# Build rules for wikix and the bundled UTF-8 helper library (libcutf8).
# NOTE: every recipe line must start with a TAB character.
#CFLAGS = -g
#CFLAGS_LIB = -g -c
CFLAGS = -Wno-pointer-sign -g
CFLAGS_LIB = -Wno-pointer-sign -g -c
CC = gcc
LD = ld
AR = ar

.PHONY: all clean install

all: wikix

# Explicit rule so the library object is built with CFLAGS_LIB and is
# rebuilt whenever the header changes.
utf8.o: utf8.c utf8.h
	$(CC) $(CFLAGS_LIB) utf8.c -o utf8.o

libcutf8.so: utf8.o
	$(LD) -shared -lc -o libcutf8.so utf8.o /usr/lib/libc.a

libcutf8.a: utf8.o
	$(AR) r libcutf8.a utf8.o

# MD5() is provided by libcrypto; linkers that do not resolve symbols through
# indirect dependencies need it named explicitly next to -lssl.
wikix: wikix.c libcutf8.a
	$(CC) $(CFLAGS) wikix.c -o wikix libcutf8.a -lssl -lcrypto

clean:
	rm -f *.o wikix libcutf8.a libcutf8.so

# The shared library is installed below, so it has to be built first;
# "all" alone only produces wikix and libcutf8.a.
install: all libcutf8.so
	install -m 755 wikix /usr/bin
	install -m 644 libcutf8.so /usr/lib
	install -m 644 libcutf8.a /usr/lib
platform.h
/* Build-target selector: wikix.c pulls in its POSIX/OpenSSL headers only
   when LINUX is defined, and its Win32 headers when WINDOWS is defined. */
#define LINUX
utf8.h
/* Public interface of the UTF-8 helper routines implemented in utf8.c.
   The header is self-contained: it pulls in the headers that declare the
   types used by the prototypes, and it may be included more than once. */
#ifndef UTF8_H
#define UTF8_H

#include <stdarg.h>
#include <stddef.h>     /* size_t */
#include <sys/types.h>  /* u_int32_t */

/* is c the start of a utf8 sequence? */
#define isutf(c) (((c)&0xC0)!=0x80)

/* convert UTF-8 data to wide character */
int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz);

/* the opposite conversion */
int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz);

/* single character to UTF-8 */
int u8_wc_toutf8(char *dest, u_int32_t ch);

/* character number to byte offset */
int u8_offset(char *str, int charnum);

/* byte offset to character number */
int u8_charnum(char *s, int offset);

/* return next character, updating an index variable */
u_int32_t u8_nextchar(char *s, int *i);

/* move to next character */
void u8_inc(char *s, int *i);

/* move to previous character */
void u8_dec(char *s, int *i);

/* returns length of next utf-8 sequence */
int u8_seqlen(char *s);

/* assuming src points to the character after a backslash, read an
   escape sequence, storing the result in dest and returning the number of
   input characters processed */
int u8_read_escape_sequence(char *src, u_int32_t *dest);

/* given a wide character, convert it to an ASCII escape sequence stored in
   buf, where buf is "sz" bytes. returns the number of characters output. */
int u8_escape_wchar(char *buf, int sz, u_int32_t ch);

/* convert a string "src" containing escape sequences to UTF-8 */
int u8_unescape(char *buf, int sz, char *src);

/* convert UTF-8 "src" to ASCII with escape sequences.
   if escape_quotes is nonzero, quote characters will be preceded by
   backslashes as well. */
int u8_escape(char *buf, int sz, char *src, int escape_quotes);

/* utility predicates used by the above */
int octal_digit(char c);
int hex_digit(char c);

/* return a pointer to the first occurrence of ch in s, or NULL if not
   found. character index of found character returned in *charn. */
char *u8_strchr(char *s, u_int32_t ch, int *charn);

/* same as the above, but searches a buffer of a given size instead of
   a NUL-terminated string. */
char *u8_memchr(char *s, u_int32_t ch, size_t sz, int *charn);

/* count the number of characters in a UTF-8 string */
int u8_strlen(char *s);

/* returns nonzero when the encoding part of a locale name is UTF-8 */
int u8_is_locale_utf8(char *locale);

/* printf where the format string and arguments may be in UTF-8.
   you can avoid this function and just use ordinary printf() if the current
   locale is UTF-8. */
int u8_vprintf(char *fmt, va_list ap);
int u8_printf(char *fmt, ...);

#endif /* UTF8_H */
utf8.c public domain
/*
Basic UTF-8 manipulation routines
by Jeff Bezanson
placed in the public domain Fall 2005
This code is designed to provide the utilities you need to manipulate
UTF-8 as an internal string encoding. These functions do not perform the
error checking normally needed when handling UTF-8 data, so if you happen
to be from the Unicode Consortium you will want to flay me alive.
I do this because error checking can be performed at the boundaries (I/O),
with these routines reserved for higher performance on data known to be
valid.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#ifdef WIN32
#include <malloc.h>
#else
#include <alloca.h>
#endif
#include "utf8.h"
/* Correction constants indexed by (sequence length in bytes - 1).
   The decoders in this file sum the raw bytes of a sequence, shifting the
   accumulator left by 6 bits between bytes, and then subtract the matching
   entry to obtain the code point (e.g. 0x3080 == (0xC0 << 6) + 0x80). */
static const u_int32_t offsetsFromUTF8[6] = {
0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL
};
/* Indexed by the value of a byte (0..255): the number of bytes that follow
   it when that byte begins a sequence.  Each row below covers 32 values;
   the last two rows hold the entries for 0xC0..0xFF. */
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/* Length in bytes of the sequence that starts at s[0], as announced by its
   first byte: one lead byte plus the trailing-byte count from the table. */
int u8_seqlen(char *s)
{
    unsigned char lead = (unsigned char)s[0];
    int trailing = trailingBytesForUTF8[(unsigned int)lead];

    return trailing + 1;
}
/* Convert UTF-8 bytes to an array of 32-bit code points.

   No validation of the encoding is performed.
   srcsz = source size in bytes, or -1 if 0-terminated
   sz = dest size in # of wide characters

   returns # characters converted
   dest will always be L'\0'-terminated, even if there isn't enough room
   for all the characters (when sz < 1 nothing at all is written).
   if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.

   Conversion stops early, without touching the bytes beyond, when the
   remaining input is too short to hold the sequence its lead byte announces.
*/
int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
{
    u_int32_t ch;
    /* only form the end pointer for a real byte count */
    char *src_end = (srcsz > 0) ? src + srcsz : src;
    int nb, k;
    int i = 0;

    if (sz < 1)
        return 0;   /* no room even for the terminator */

    while (i < sz-1) {
        if (srcsz == -1) {
            if (*src == 0)
                goto done_toucs;
            nb = trailingBytesForUTF8[(unsigned char)*src];
            /* a sequence cut short by the terminator must not be read
               past it; bytes are examined strictly in order */
            for (k = 1; k <= nb; k++) {
                if (src[k] == 0)
                    goto done_toucs;
            }
        }
        else {
            /* check for exhausted input BEFORE looking at *src */
            if (src >= src_end)
                goto done_toucs;
            nb = trailingBytesForUTF8[(unsigned char)*src];
            if (nb >= src_end - src)
                goto done_toucs;
        }
        ch = 0;
        switch (nb) {
            /* these fall through deliberately; 5 and 4 are present so that
               over-long lead bytes still consume their input */
        case 5: ch += (unsigned char)*src++; ch <<= 6;
        case 4: ch += (unsigned char)*src++; ch <<= 6;
        case 3: ch += (unsigned char)*src++; ch <<= 6;
        case 2: ch += (unsigned char)*src++; ch <<= 6;
        case 1: ch += (unsigned char)*src++; ch <<= 6;
        case 0: ch += (unsigned char)*src++;
        }
        ch -= offsetsFromUTF8[nb];
        dest[i++] = ch;
    }
 done_toucs:
    dest[i] = 0;
    return i;
}
/* Encode an array of code points as UTF-8.

   srcsz = number of source characters, or a negative value if the array is
           0-terminated
   sz    = size of dest buffer in bytes

   Returns the number of source characters consumed.  As soon as a character
   does not fit in the space that is left, the function returns without
   writing a terminator.  A '\0' is appended only when every character fit
   and one more byte is still free.  Values of 0x110000 and above produce no
   output but still count as consumed.
*/
int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz)
{
    char *out = dest;
    char *limit = dest + sz;
    int n;

    for (n = 0; (srcsz < 0) ? (src[n] != 0) : (n < srcsz); n++) {
        u_int32_t cp = src[n];
        int need;

        if (cp < 0x80)
            need = 1;
        else if (cp < 0x800)
            need = 2;
        else if (cp < 0x10000)
            need = 3;
        else if (cp < 0x110000)
            need = 4;
        else
            continue;               /* not encodable: skipped */

        if (limit - out < need)
            return n;

        switch (need) {
        case 1:
            *out++ = (char)cp;
            break;
        case 2:
            *out++ = (cp>>6) | 0xC0;
            *out++ = (cp & 0x3F) | 0x80;
            break;
        case 3:
            *out++ = (cp>>12) | 0xE0;
            *out++ = ((cp>>6) & 0x3F) | 0x80;
            *out++ = (cp & 0x3F) | 0x80;
            break;
        default:
            *out++ = (cp>>18) | 0xF0;
            *out++ = ((cp>>12) & 0x3F) | 0x80;
            *out++ = ((cp>>6) & 0x3F) | 0x80;
            *out++ = (cp & 0x3F) | 0x80;
            break;
        }
    }
    if (out < limit)
        *out = '\0';
    return n;
}
/* Encode the single code point ch at dest (no terminator is written).
   Returns the number of bytes stored (1..4), or 0 when ch is 0x110000 or
   larger, in which case dest is left untouched. */
int u8_wc_toutf8(char *dest, u_int32_t ch)
{
    int len, k;
    unsigned int lead;

    if (ch < 0x80) {
        dest[0] = (char)ch;
        return 1;
    }

    if (ch < 0x800) {
        len = 2;
        lead = 0xC0;
    }
    else if (ch < 0x10000) {
        len = 3;
        lead = 0xE0;
    }
    else if (ch < 0x110000) {
        len = 4;
        lead = 0xF0;
    }
    else {
        return 0;
    }

    /* continuation bytes are filled from the end, 6 payload bits each */
    for (k = len - 1; k > 0; k--) {
        dest[k] = (ch & 0x3F) | 0x80;
        ch >>= 6;
    }
    dest[0] = ch | lead;
    return len;
}
/* Translate a character index into a byte offset.  Walks at most charnum
   characters from the start of str and stops early at the terminator; each
   character is taken to be 1 to 4 bytes long. */
int u8_offset(char *str, int charnum)
{
    int pos = 0;
    int left;

    for (left = charnum; left > 0 && str[pos]; left--) {
        int extra;

        /* step over the lead byte, then over up to three continuation bytes */
        pos++;
        for (extra = 0; extra < 3 && !isutf(str[pos]); extra++)
            pos++;
    }
    return pos;
}
/* Translate a byte offset into a character index: counts the characters that
   begin before `offset`, stopping early at the terminator. */
int u8_charnum(char *s, int offset)
{
    int chars = 0;
    int pos = 0;

    for (; pos < offset && s[pos]; chars++) {
        int extra;

        /* step over the lead byte, then over up to three continuation bytes */
        pos++;
        for (extra = 0; extra < 3 && !isutf(s[pos]); extra++)
            pos++;
    }
    return chars;
}
/* Number of characters (not bytes) in the NUL-terminated string s.
   The terminator is tested here before u8_nextchar() is called, so the
   decoder is never asked to start on the terminator and never looks at the
   byte that follows it. */
int u8_strlen(char *s)
{
    int count = 0;
    int i = 0;

    while (s[i] != 0 && u8_nextchar(s, &i) != 0)
        count++;

    return count;
}
/* Reads the sequence that starts at s[*i] and returns its code point,
   advancing *i to the first byte of the following sequence.

   When s[*i] is the terminator the function returns 0 and leaves *i where
   it is, so repeated calls can never walk off the end of the string.
   At most 6 bytes are folded into one character, which keeps the lookup
   into offsetsFromUTF8 inside the table even for malformed input. */
u_int32_t u8_nextchar(char *s, int *i)
{
    u_int32_t ch = 0;
    int sz = 0;

    for (;;) {
        unsigned char byte = (unsigned char)s[*i];

        ch <<= 6;
        ch += byte;
        sz++;
        if (byte == 0)
            break;              /* stay on the terminator */
        (*i)++;
        /* the terminator also counts as "start of a sequence" */
        if (sz >= 6 || isutf(s[*i]))
            break;
    }
    ch -= offsetsFromUTF8[sz-1];

    return ch;
}
/* Advance *i to the start of the next character: one step over the current
   byte, then up to three more while the byte reached is a continuation byte. */
void u8_inc(char *s, int *i)
{
    int extra;

    ++(*i);
    for (extra = 0; extra < 3 && !isutf(s[*i]); extra++)
        ++(*i);
}
/* Move *i back to the start of the previous character: one step back, then
   up to three more while the byte reached is a continuation byte.
   No check is made against the start of the string. */
void u8_dec(char *s, int *i)
{
    int extra;

    --(*i);
    for (extra = 0; extra < 3 && !isutf(s[*i]); extra++)
        --(*i);
}
/* Returns 1 when c is one of '0'..'7', otherwise 0. */
int octal_digit(char c)
{
    if (c < '0')
        return 0;
    if (c > '7')
        return 0;
    return 1;
}
/* Returns 1 when c is a hexadecimal digit (0-9, A-F, a-f), otherwise 0. */
int hex_digit(char c)
{
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'A' && c <= 'F')
        return 1;
    if (c >= 'a' && c <= 'f')
        return 1;
    return 0;
}
/* Decode one escape sequence.

   str points to the character AFTER the backslash.  Recognised forms are the
   single letters n t r b f v a, up to three octal digits, \x with up to two
   hex digits, \u with up to four and \U with up to eight.  Any other
   character stands for itself (its byte value is stored).

   The decoded value is stored in *dest and the number of input characters
   consumed is returned.  If str points at the string terminator (a lone
   backslash ended the input) nothing is consumed: the function stores a
   literal backslash and returns 0, so callers never step past the end. */
int u8_read_escape_sequence(char *str, u_int32_t *dest)
{
    u_int32_t ch;
    char digs[9] = {0};
    int dno = 0, i = 1;
    int maxhex = 0;     /* digit budget for the \x, \u and \U forms */

    if (str[0] == '\0') {
        *dest = (u_int32_t)'\\';
        return 0;
    }

    /* default: take the character literally; the cast avoids sign
       extension of bytes >= 0x80 where plain char is signed */
    ch = (u_int32_t)(unsigned char)str[0];

    switch (str[0]) {
    case 'n': ch = L'\n'; break;
    case 't': ch = L'\t'; break;
    case 'r': ch = L'\r'; break;
    case 'b': ch = L'\b'; break;
    case 'f': ch = L'\f'; break;
    case 'v': ch = L'\v'; break;
    case 'a': ch = L'\a'; break;
    case 'x': maxhex = 2; break;
    case 'u': maxhex = 4; break;
    case 'U': maxhex = 8; break;
    default:
        if (octal_digit(str[0])) {
            i = 0;
            do {
                digs[dno++] = str[i++];
            } while (octal_digit(str[i]) && dno < 3);
            ch = (u_int32_t)strtoul(digs, NULL, 8);
        }
        break;
    }

    if (maxhex > 0) {
        while (hex_digit(str[i]) && dno < maxhex) {
            digs[dno++] = str[i++];
        }
        /* with no digits the introducer letter itself is the result;
           strtoul keeps 8-digit \U values intact where long is 32 bits */
        if (dno > 0)
            ch = (u_int32_t)strtoul(digs, NULL, 16);
    }
    *dest = ch;

    return i;
}
/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
   example: u8_unescape(mybuf, 256, "hello\\u220e")
   note the double backslash is needed if called on a C string literal

   buf receives at most sz bytes; it is NUL-terminated only when there is
   room left for the terminator.  Returns the number of bytes written.

   Bytes that are not part of an escape sequence are copied unchanged, so
   UTF-8 text already present in src passes through intact.  A backslash
   that is the last character of src is copied literally. */
int u8_unescape(char *buf, int sz, char *src)
{
    int c=0, amt;
    u_int32_t ch;
    char temp[4];

    while (*src && c < sz) {
        if (*src == '\\' && src[1] != '\0') {
            src++;
            src += u8_read_escape_sequence(src, &ch);
            amt = u8_wc_toutf8(temp, ch);
        }
        else {
            temp[0] = *src++;
            amt = 1;
        }
        if (amt > sz-c)
            break;
        memcpy(&buf[c], temp, amt);
        c += amt;
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}
/* Write the ASCII escape form of the code point ch into buf (sz bytes).
   Control characters with a letter escape and the backslash use the
   two-character form, other values below 32 and 0x7f use \x, values above
   0xFFFF use \U with 8 digits, 0x80..0xFFFF use \u with 4 digits, and
   everything else is written as itself.  The result of snprintf() is
   returned unchanged. */
int u8_escape_wchar(char *buf, int sz, u_int32_t ch)
{
    switch (ch) {
    case '\n': return snprintf(buf, sz, "\\n");
    case '\t': return snprintf(buf, sz, "\\t");
    case '\r': return snprintf(buf, sz, "\\r");
    case '\b': return snprintf(buf, sz, "\\b");
    case '\f': return snprintf(buf, sz, "\\f");
    case '\v': return snprintf(buf, sz, "\\v");
    case '\a': return snprintf(buf, sz, "\\a");
    case '\\': return snprintf(buf, sz, "\\\\");
    default:
        break;
    }

    if (ch < 32 || ch == 0x7f)
        return snprintf(buf, sz, "\\x%hhX", (unsigned char)ch);
    if (ch > 0xFFFF)
        return snprintf(buf, sz, "\\U%.8X", (u_int32_t)ch);
    if (ch >= 0x80)
        return snprintf(buf, sz, "\\u%.4hX", (unsigned short)ch);

    return snprintf(buf, sz, "%c", (char)ch);
}
/* Convert the UTF-8 string src to ASCII with escape sequences, writing at
   most sz bytes (terminator included) to buf.  When escape_quotes is
   nonzero, double quotes are written as \".

   Returns the number of bytes written, not counting the terminator.
   Conversion stops before the first character whose escaped form does not
   fit together with the terminator; an escape is never left half-written
   and the output is always NUL-terminated when sz > 0. */
int u8_escape(char *buf, int sz, char *src, int escape_quotes)
{
    int c=0, i=0, amt;

    while (src[i] && c < sz) {
        if (escape_quotes && src[i] == '"') {
            amt = snprintf(buf, sz - c, "\\\"");
            i++;
        }
        else {
            amt = u8_escape_wchar(buf, sz - c, u8_nextchar(src, &i));
        }
        /* snprintf reports the length it WANTED to write; anything that
           does not leave room for the terminator was truncated */
        if (amt < 0 || amt >= sz - c)
            break;
        c += amt;
        buf += amt;
    }
    if (c < sz)
        *buf = '\0';    /* also discards a truncated escape */
    return c;
}
/* Find the first occurrence of the code point ch in the NUL-terminated
   string s.  Returns a pointer to the first byte of the match, or NULL.
   *charn receives the character index of the match, or the total number of
   characters scanned when there is none. */
char *u8_strchr(char *s, u_int32_t ch, int *charn)
{
    int pos = 0;

    *charn = 0;
    while (s[pos]) {
        int start = pos;

        if (u8_nextchar(s, &pos) == ch)
            return &s[start];
        (*charn)++;
    }
    return NULL;
}
/* Like u8_strchr(), but searches the first sz bytes of s instead of a
   NUL-terminated string.  Returns a pointer to the first byte of the match
   or NULL; *charn receives the character index of the match, or the number
   of characters scanned when there is none.

   Indices are size_t so they compare correctly against sz, and at most
   6 bytes are folded into one character so the lookup into
   offsetsFromUTF8 stays inside the table on malformed input. */
char *u8_memchr(char *s, u_int32_t ch, size_t sz, int *charn)
{
    size_t i = 0, lasti = 0;
    u_int32_t c;
    int csz;

    *charn = 0;
    while (i < sz) {
        c = 0;
        csz = 0;
        do {
            c <<= 6;
            c += (unsigned char)s[i++];
            csz++;
        } while (i < sz && csz < 6 && !isutf(s[i]));
        c -= offsetsFromUTF8[csz-1];

        if (c == ch) {
            return &s[lasti];
        }
        lasti = i;
        (*charn)++;
    }
    return NULL;
}
/* Returns 1 when the encoding part of a locale name (the text between the
   first '.' and the next '@', '+', ',' or the end) is exactly "UTF-8" or
   "utf8", and 0 otherwise.  A '.' that appears after '@', '+' or ',' is
   not considered.  (this code based on libutf8) */
int u8_is_locale_utf8(char *locale)
{
    const char *dot = locale + strcspn(locale, ".@+,");
    const char *encoding;
    size_t enclen;

    if (*dot != '.')
        return 0;

    encoding = dot + 1;
    enclen = strcspn(encoding, "@+,");

    if (enclen == 5 && !strncmp(encoding, "UTF-8", 5))
        return 1; /* it's UTF-8 */
    if (enclen == 4 && !strncmp(encoding, "utf8", 4))
        return 1; /* it's UTF-8 */
    return 0;
}
/* printf-style output where the format string and arguments may contain
   UTF-8: the text is formatted, converted to 32-bit code points with
   u8_toucs() and printed with "%ls".

   Returns the number of characters converted, or a negative value when
   formatting or memory allocation fails.

   The argument list is copied before the first formatting pass because a
   va_list may not be reused once vsnprintf() has consumed it, and the retry
   buffer is sized to hold the complete result. */
int u8_vprintf(char *fmt, va_list ap)
{
    char stackbuf[512];
    char *buf = stackbuf;
    u_int32_t *wcs;
    va_list retry_ap;
    int cnt;

    va_copy(retry_ap, ap);
    cnt = vsnprintf(stackbuf, sizeof stackbuf, fmt, ap);
    if (cnt >= (int)sizeof stackbuf) {
        buf = malloc((size_t)cnt + 1);
        if (buf != NULL)
            cnt = vsnprintf(buf, (size_t)cnt + 1, fmt, retry_ap);
        else
            cnt = -1;
    }
    va_end(retry_ap);

    if (cnt >= 0) {
        wcs = malloc(((size_t)cnt + 1) * sizeof *wcs);
        if (wcs != NULL) {
            cnt = u8_toucs(wcs, cnt+1, buf, cnt);
            /* NOTE(review): this cast assumes wchar_t is 32 bits wide */
            printf("%ls", (wchar_t*)wcs);
            free(wcs);
        }
        else {
            cnt = -1;
        }
    }
    if (buf != stackbuf)
        free(buf);
    return cnt;
}
/* Variadic front end for u8_vprintf(): packages the arguments that follow
   fmt and returns whatever u8_vprintf() returns. */
int u8_printf(char *fmt, ...)
{
    va_list ap;
    int result;

    va_start(ap, fmt);
    result = u8_vprintf(fmt, ap);
    va_end(ap);

    return result;
}
wikix.c
#include "platform.h"
#ifdef WINDOWS
#define strncasecmp strnicmp
#include "windows.h"
#include "winioctl.h"
#include "winuser.h"
#include "stdarg.h"
typedef UCHAR BYTE;
typedef USHORT WORD;
#include "stdio.h"
#include "stdlib.h"
#include "ctype.h"
#include "conio.h"
#endif
#ifdef LINUX
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <ctype.h>
#include <string.h>
//#include <ncurses.h>
#include <termios.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#include <ctype.h>
#include <openssl/md5.h>
#endif
/* Number of buckets in each hash table; shash() reduces keys modulo this. */
#define NAME_HASH_SIZE 8192
/* One remembered string.  learn() and imagename() allocate the node and its
   text in a single block, with `text` pointing just past the struct. */
typedef struct _hash
{
struct _hash *next;
struct _hash *prior;
unsigned long len;
char *text;
} hash;
/* One bucket: a doubly linked chain of nodes, appended at the tail. */
typedef struct _hash_list {
hash *head;
hash *tail;
} hash_list;
/* NOTE(review): main() declares a local `buffer` that shadows this one. */
unsigned char buffer[8192 * 4];
/* Paths taken from the command line: -i, -o, -r and -c respectively. */
unsigned char ImagePath[512];
unsigned char OutputPath[512];
unsigned char iPath[512];
unsigned char cPath[512];
/* MD5 digests of the image name and of its underscore variant; the first
   digest byte selects the output directory in strip_image_info(). */
unsigned char md5_out[1024];
unsigned char md5_ulout[1024];
/* Scratch buffers of strip_image_info(): raw candidate text (wk),
   shell-escaped names (final1, final2), underscore variant (ulwk),
   "image..."/"map..." keyword (fwk) and entity-expanded name (expand). */
unsigned char wk[8192];
unsigned char final1[4096];
unsigned char final2[4096];
unsigned char ulwk[4096];
unsigned char fwk[4096];
unsigned char expand[4096];
/* NOTE(review): no use of `html` is visible in this part of the file. */
unsigned char html[4096];
/* Output streams selected by the top four bits of the MD5 digest when
   pmode is set. */
FILE *fpl[16];
/* Set by the -p and -t command line options. */
int pmode = 0, tree = 0;
/* The two hash tables, allocated by init_hash_list(). */
hash_list *learn_list_head = NULL;
hash_list *name_list_head = NULL;
/* Count of nodes added by learn()/imagename(); decremented on removal. */
int lobj = 0;
/* Log streams written by strip_image_info(); presumably opened further down
   in main() -- not visible here. */
FILE *imagelog = NULL, *imagereject = NULL, *fragmentlog = NULL;
/* Case-insensitive string hash.
   Folds at most `len` bytes of v (stopping early at a NUL) into a value in
   the range 0 .. M-1 using a multiplier of 127.  Returns 0 when M is 0.

   Each byte is converted to unsigned char before it reaches tolower():
   passing a negative char value is undefined, and file names in the dumps
   routinely contain bytes >= 0x80. */
unsigned long shash(char *v, unsigned long len, unsigned long M)
{
    const unsigned long a = 127;
    unsigned long h = 0;
    unsigned long i;

    if (M == 0)
        return 0;       /* avoid a modulo by zero */

    for (i = 0; i < len && *v; v++, i++)
        h = ((a * h) + (unsigned long)tolower((unsigned char)*v)) % M;
    return h;
}
/* Append `name` to the tail of its bucket in the table `top`.
   The bucket is chosen by hashing name->text (name->len bytes).
   Returns 0 on success and (unsigned long)-1 when no table was supplied. */
unsigned long add_to_hash(hash_list *top, hash *name)
{
    unsigned long slot = shash(name->text, name->len, NAME_HASH_SIZE);
    hash_list *bucket;

    if (slot == (unsigned long) -1)
        return -1;
    if (!top)
        return -1;

    bucket = &top[slot];
    if (bucket->head == NULL)
    {
        /* first node of this bucket */
        name->prior = NULL;
        bucket->head = name;
    }
    else
    {
        name->prior = bucket->tail;
        bucket->tail->next = name;
    }
    name->next = NULL;
    bucket->tail = name;
    return 0;
}
/* Unlink `name` from its bucket in the table `top` and decrement the global
   node counter lobj when it is nonzero.  The node itself is not freed.
   Returns 0 on success and (unsigned long)-1 when no table was supplied. */
unsigned long remove_from_hash(hash_list *top, hash *name)
{
    unsigned long slot = shash(name->text, name->len, NAME_HASH_SIZE);
    hash_list *bucket;

    if (slot == (unsigned long) -1)
        return -1;
    if (!top)
        return -1;

    bucket = &top[slot];
    if (bucket->head == name)
    {
        /* removing the first node: its successor becomes the head */
        bucket->head = name->next;
        if (bucket->head)
            bucket->head->prior = NULL;
        else
            bucket->tail = NULL;
    }
    else
    {
        name->prior->next = name->next;
        if (bucket->tail == name)
            bucket->tail = name->prior;
        else
            name->next->prior = name->prior;
    }

    if (lobj != 0)
        lobj--;
    return 0;
}
/* Free every node of every bucket of *table, then the table itself, and
   reset *table to NULL.  Does nothing when *table is already NULL. */
static void free_hash_table(hash_list **table)
{
    hash_list *buckets = *table;
    int slot;

    if (!buckets)
        return;

    for (slot = 0; slot < NAME_HASH_SIZE; slot++)
    {
        hash *node = buckets[slot].head;

        buckets[slot].head = buckets[slot].tail = 0;
        while (node)
        {
            hash *doomed = node;

            node = node->next;
            free((void *)doomed);
        }
    }
    free(buckets);
    *table = NULL;
}

/* Release both global hash tables (learn_list_head, then name_list_head)
   together with all nodes they hold. */
void free_hash(void)
{
    free_hash_table(&learn_list_head);
    free_hash_table(&name_list_head);
}
/* Allocate and zero both global hash tables (NAME_HASH_SIZE buckets each).
   Returns learn_list_head on success.  On failure returns NULL with both
   globals reset to NULL and nothing left allocated, so a partial failure
   does not leak the table that was obtained. */
hash_list *init_hash_list(void)
{
    learn_list_head = calloc(NAME_HASH_SIZE, sizeof *learn_list_head);
    name_list_head = calloc(NAME_HASH_SIZE, sizeof *name_list_head);

    if (!learn_list_head || !name_list_head)
    {
        free(learn_list_head);
        free(name_list_head);
        learn_list_head = NULL;
        name_list_head = NULL;
        return NULL;
    }
    return learn_list_head;
}
/* Look up `text` (len bytes, compared case-insensitively) in the table
   `top`.  Returns the matching node, or NULL when there is none.  A NULL
   table or text also yields NULL, matching the table check that
   add_to_hash() and remove_from_hash() perform. */
hash *search_name_hash(hash_list *top, char *text, unsigned long len)
{
    hash *entry;

    if (!top || !text)
        return NULL;

    entry = top[shash(text, len, NAME_HASH_SIZE)].head;
    while (entry)
    {
        if (entry->len == len && !strncasecmp(entry->text, text, len))
            return entry;
        entry = entry->next;
    }
    return NULL;
}
/* Remember the string s (len bytes) in the learn_list_head table.
   Returns 0 when the string was added, and 1 when it was already present
   or could not be stored (bad length, allocation or insertion failure).

   Node and text share one zero-filled allocation, so the stored text is
   always NUL-terminated. */
int learn(char *s, int len)
{
    hash *name;

    if (len < 0)
        return 1;
    if (search_name_hash(learn_list_head, s, len))
        return 1;

    name = calloc(1, sizeof(hash) + (size_t)len + 2);
    if (!name)
        return 1;

    /* the text lives directly behind the node header */
    name->text = (char *)(name + 1);
    name->len = len;
    strncpy(name->text, s, len);

    if (add_to_hash(learn_list_head, name) == (unsigned long) -1)
    {
        free(name);
        return 1;
    }
    lobj++;
    return 0;
}
/* Remember the string s (len bytes) in the name_list_head table.
   Returns 0 when the string was added, and 1 when it was already present
   or could not be stored (bad length, allocation or insertion failure).

   Node and text share one zero-filled allocation, so the stored text is
   always NUL-terminated. */
int imagename(char *s, int len)
{
    hash *name;

    if (len < 0)
        return 1;
    if (search_name_hash(name_list_head, s, len))
        return 1;

    name = calloc(1, sizeof(hash) + (size_t)len + 2);
    if (!name)
        return 1;

    /* the text lives directly behind the node header */
    name->text = (char *)(name + 1);
    name->len = len;
    strncpy(name->text, s, len);

    if (add_to_hash(name_list_head, name) == (unsigned long) -1)
    {
        free(name);
        return 1;
    }
    lobj++;
    return 0;
}
/* Write at most len characters of s to fp, stopping early at the end of the
   string.  Returns a pointer to the first character that was not written;
   a NULL or empty s is returned unchanged. */
unsigned char *nprintf(char *s, int len, FILE *fp)
{
    int left = len;

    if (s == NULL || *s == '\0')
        return (unsigned char *)s;

    while (*s != '\0' && left > 0)
    {
        putc(*s, fp);
        s++;
        left--;
    }
    return (unsigned char *)s;
}
/* Return a pointer to the last position in s whose character equals any of
   c1..c8, or NULL when none occurs.  The search starts at the terminating
   NUL, so a value of 0 among c1..c8 matches the end of the string.

   The scan is index based: it never forms a pointer below the start of s,
   which the previous `--p >= s` loop did on its final step. */
unsigned char *str8rchr(const char * s, int c1, int c2, int c3, int c4,
                        int c5, int c6, int c7, int c8)
{
    size_t pos = strlen(s) + 1;

    while (pos-- > 0) {
        char cur = s[pos];

        if ((cur == (char)c1) || (cur == (char)c2) || (cur == (char)c3) ||
            (cur == (char)c4) || (cur == (char)c5) || (cur == (char)c6) ||
            (cur == (char)c7) || (cur == (char)c8))
            return (unsigned char *)(s + pos);
    }
    return NULL;
}
/* Return a pointer to the last position in s whose character equals any of
   c1..c5, or NULL when none occurs.  The search starts at the terminating
   NUL, so a value of 0 among c1..c5 matches the end of the string.

   The scan is index based: it never forms a pointer below the start of s,
   which the previous `--p >= s` loop did on its final step. */
unsigned char *str5rchr(const char * s, int c1, int c2, int c3, int c4,
                        int c5)
{
    size_t pos = strlen(s) + 1;

    while (pos-- > 0) {
        char cur = s[pos];

        if ((cur == (char)c1) || (cur == (char)c2) || (cur == (char)c3) ||
            (cur == (char)c4) || (cur == (char)c5))
            return (unsigned char *)(s + pos);
    }
    return NULL;
}
/* Case-insensitive substring search: returns a pointer to the first place in
   s1 where s2 occurs ignoring case, s1 itself when s2 is empty, and NULL
   when there is no match. */
char *strnstr(const char * s1,const char * s2)
{
    size_t needle_len = strlen(s2);
    size_t hay_len;
    size_t off;

    if (needle_len == 0)
        return (char *) s1;

    hay_len = strlen(s1);
    for (off = 0; off + needle_len <= hay_len; off++) {
        if (strncasecmp(s1 + off, s2, needle_len) == 0)
            return (char *) (s1 + off);
    }
    return NULL;
}
/* File-name extensions recognised as media files.
   strip_image_info() walks this table in order, looks for each entry with
   the case-insensitive strnstr(), and cuts the candidate name off right
   after the first entry found.  The entries are grouped by decreasing
   length (number of characters after the dot), so order matters. */
unsigned char *imagetypes[]=
{
// 7 characters after the dot
".svg+xml",
".xcf.bz2",
// 6 characters after the dot
".bitmap",
".xcfbz2",
// 5 characters after the dot
".xcfgz",
".alpha",
".dicom",
".matte",
".xjtgz",
// 4 characters after the dot
".mask",
".aifc",
".aiff",
".fits",
".icon",
".im24",
".im32",
".jpeg",
".midi",
".mpeg",
".xwav",
".mpga",
".tiff",
".djvu",
// 3 characters after the dot
".aif",
".als",
".apm",
".bmp",
".bz2",
".cel",
".dcm",
".eps",
".fit",
".flc",
".fli",
".gbr",
".gif",
".gih",
".gpb",
".ico",
".im1",
".im8",
".jpe",
".jpg",
".kar",
".mid",
".mov",
".mp2",
".mp3",
".mp4",
".mpa",
".mpg",
".ogg",
".ogm",
".pcc",
".pcx",
".pdf",
".pdm",
".pgm",
".pix",
".png",
".pnm",
".ppm",
".psd",
".psp",
".ras",
".rgb",
".sgi",
".svg",
".swf",
".tga",
".tif",
".tub",
".wav",
".wmf",
".xbm",
".xcf",
".xjt",
".xpm",
".xwd",
".pov",
".wma",
".dia",
".fig",
".jif",
".pgn",
".art",
".djv",
// 2 characters after the dot
".bw",
".ps",
".g3",
".js",
".rs",
};
/* Copy src to dst, putting a backslash in front of every character that the
   generated scripts escape (quotes, backquote, space, brackets, braces,
   parentheses, '&', '-' and ';').  dst needs 2 * strlen(src) + 1 bytes.
   NOTE(review): other shell metacharacters ($ | < > * ? ! # ~) are NOT
   escaped; names come from untrusted dump text, so the generated scripts
   should be treated accordingly. */
static void wikix_shell_escape(unsigned char *dst, const unsigned char *src)
{
    while (*src)
    {
        switch (*src)
        {
        case '\"': case '\'': case '`':
        case ' ': case '(': case ')':
        case '{': case '}': case '[':
        case ']': case '&': case '-':
        case ';':
            *dst++ = '\\';
            break;
        default:
            break;
        }
        *dst++ = *src++;
    }
    *dst = '\0';
}

/* Emit the "-t" (copy from an existing image tree) script fragment for one
   image.  dir2/name2 are only used when cnvt is nonzero. */
static void wikix_emit_tree_script(FILE *fp, const char *dir1,
                                   const char *name1, const char *dir2,
                                   const char *name2, int cnvt)
{
    fprintf(fp, "if [ -a $IMAGE./%s%s ]; then\n", dir1, name1);
    fprintf(fp, "\t/bin/mkdir -p $OUTPUT./%s\n", dir1);
    fprintf(fp, "\tcp -f $IMAGE./%s%s $OUTPUT./%s%s\n",
            dir1, name1, dir1, name1);
    fprintf(fp, "\techo ./%s%s copied to $OUTPUT./%s%s >> "
            "copied.log\n", dir1, name1, dir1, name1);
    if (cnvt)
    {
        fprintf(fp, "elif [ -a $IMAGE./%s%s ]; then\n", dir2, name2);
        fprintf(fp, "\t/bin/mkdir -p $OUTPUT./%s\n", dir2);
        fprintf(fp, "\tcp -f $IMAGE./%s%s $OUTPUT./%s%s\n",
                dir2, name2, dir2, name2);
        fprintf(fp, "\techo ./%s%s copied to $OUTPUT./%s%s >> "
                "copied.log\n", dir2, name2, dir2, name2);
    }
    fprintf(fp, "else\n");
    fprintf(fp, "\techo ./%s%s file not found >> failed.log\n", dir1, name1);
    fprintf(fp, "fi\n\n");
}

/* Emit the download script fragment for one image: try the wiki, then
   Commons, and (when cnvt is nonzero) repeat both for the underscore
   variant dir2/name2. */
static void wikix_emit_download_script(FILE *fp, const char *dir1,
                                       const char *name1, const char *dir2,
                                       const char *name2, int cnvt)
{
    fprintf(fp, "if [ -a $IMAGE./%s%s ]; then\n", dir1, name1);
    fprintf(fp, "\techo %s%s already exists >> exists.log\n", dir1, name1);
    if (cnvt)
    {
        fprintf(fp, "elif [ -a $IMAGE./%s%s ]; then\n", dir2, name2);
        fprintf(fp, "\techo %s%s already exists >> exists.log\n",
                dir2, name2);
    }
    fprintf(fp, "else\n");
    fprintf(fp, "\tcurl --retry 7 -f -O $IMAGEPATH./%s%s\n", dir1, name1);
    fprintf(fp, "\tif [ -a $IMAGE./%s ]; then\n", name1);
    fprintf(fp, "\t\t/bin/mkdir -p $OUTPUT./%s\n", dir1);
    fprintf(fp, "\t\t/bin/mv ./%s $OUTPUT./%s\n", name1, dir1);
    fprintf(fp, "\t\techo ./%s%s downloaded >> download.log\n", dir1, name1);
    fprintf(fp, "\telse\n");
    fprintf(fp, "\t\tcurl --retry 7 -f -O $COMMONSPATH./%s%s\n", dir1, name1);
    fprintf(fp, "\t\tif [ -a $IMAGE./%s ]; then\n", name1);
    fprintf(fp, "\t\t\t/bin/mkdir -p $OUTPUT./%s\n", dir1);
    fprintf(fp, "\t\t\t/bin/mv ./%s $OUTPUT./%s\n", name1, dir1);
    fprintf(fp, "\t\t\techo ./%s%s downloaded >> download.log\n",
            dir1, name1);
    fprintf(fp, "\t\telse\n");
    if (cnvt)
    {
        fprintf(fp, "\t\t\tcurl --retry 7 -f -O $IMAGEPATH./%s%s\n",
                dir2, name2);
        fprintf(fp, "\t\t\tif [ -a $IMAGE./%s ]; then\n", name2);
        fprintf(fp, "\t\t\t\t/bin/mkdir -p $OUTPUT./%s\n", dir2);
        fprintf(fp, "\t\t\t\t/bin/mv ./%s $OUTPUT./%s\n", name2, dir2);
        fprintf(fp, "\t\t\t\techo ./%s%s downloaded >> "
                "download.log\n", dir2, name2);
        fprintf(fp, "\t\t\telse\n");
        fprintf(fp, "\t\t\t\tcurl --retry 7 -f -O $COMMONSPATH./%s%s\n",
                dir2, name2);
        fprintf(fp, "\t\t\t\tif [ -a $IMAGE./%s ]; then\n", name2);
        fprintf(fp, "\t\t\t\t\t/bin/mkdir -p $OUTPUT./%s\n", dir2);
        fprintf(fp, "\t\t\t\t\t/bin/mv ./%s $OUTPUT./%s\n", name2, dir2);
        fprintf(fp, "\t\t\t\t\techo ./%s%s downloaded >> "
                "download.log\n", dir2, name2);
        fprintf(fp, "\t\t\t\telse\n");
        fprintf(fp, "\t\t\t\t\techo ./%s%s failed >> failed.log\n",
                dir1, name1);
        fprintf(fp, "\t\t\t\t\techo ./%s%s failed >> failed.log\n",
                dir2, name2);
        fprintf(fp, "\t\t\t\tfi\n");
        fprintf(fp, "\t\t\tfi\n");
    }
    else
    {
        fprintf(fp, "\t\t\techo ./%s%s failed >> failed.log\n", dir1, name1);
    }
    fprintf(fp, "\t\tfi\n");
    fprintf(fp, "\tfi\n");
    fprintf(fp, "fi\n\n");
}

/* Scan the text at s for one image reference and emit the script fragment
   that fetches (or, with -t, copies) it.

   s     - text following an image/link marker; the scratch copy is made in
           the global buffer wk, s itself is only modified through the
           recursive call below
   title - title of the page being processed, used as a prefix in the logs
           when it is not empty

   Keywords beginning with "image" or "map" that are followed by '=' or ':'
   are recorded in fragmentlog and the scan restarts after them.  The text
   up to the first '|', ']' or newline is taken as the candidate name: XML
   entities are decoded, everything after a known extension (imagetypes) is
   cut off and any leading namespace/path part is dropped.  Candidates
   without a known extension may be written to imagereject.  Names already
   seen (learn()) produce no output.  Returns the position at which the
   caller should continue scanning.

   Names too long for the fixed-size scratch buffers are skipped. */
unsigned char *strip_image_info(unsigned char *s, char *title)
{
    unsigned char *p, *j;
    FILE *fp = stdout;
    unsigned char ch = '\0';    /* character preceding *s */

    while (*s && (isspace(*s))) s++;

    if (!strncasecmp(s, "no image", 8))
        return s;

    p = s;
    while (*s)
    {
        if ((!strncasecmp(s, "image", 5) ||
             !strncasecmp(s, "map", 3)) && !isalnum(ch))
        {
            unsigned char *fragment = s, *end;

            if (!strncasecmp(s, "image", 5))
                s += 5;
            else
                s += 3;

            if (*s)
            {
                while (*s && isalnum(*s)) s++;
                end = s;
                while (*s && isspace(*s)) s++;
                if (*s == '=' || *s == ':')
                {
                    /* only the first 256 bytes of fwk are cleared, so the
                       keyword is limited to 255 bytes to stay terminated */
                    size_t flen = (size_t)(end - fragment);

                    if (flen > 255)
                        flen = 255;
                    memset(&fwk[0], 0, 256);
                    memmove(&fwk[0], fragment, flen);
                    if (!learn(&fwk[0], (int)flen))
                    {
                        if (*title)
                            fprintf(fragmentlog, "[%s] %s\n", title, &fwk[0]);
                        else
                            fprintf(fragmentlog, "%s\n", &fwk[0]);
                        fflush(fragmentlog);
                    }
                    s++;
                    s = strip_image_info(s, title);
                    ch = '\0';
                }
            }
            continue;
        }

        if ((*s == '|') || (*s == ']') || (*s == '\n'))
        {
            size_t y;
            unsigned char *l;
            unsigned char dir1[32], dir2[32];
            unsigned char *lp, *blp;
            unsigned char *ulp, *bulp;
            unsigned char *wk_end = &wk[sizeof(wk) - 1];
            int cnvt = 0, unicnvt = 0, invl = 0;

            /* copy the candidate text into wk, decoding XML entities.
               Every special case re-enters the loop so that the end
               conditions are checked again before anything is copied. */
            lp = &wk[0];
            j = lp;
            while (*p && (p < s) && (j < wk_end))
            {
                // skip self referencing images
                if (!strncasecmp(p, "{{", 2))
                    return s;

                if (!strncmp(p, "&quot;", 6))
                {
                    p += 6;
                    *j++ = '\x22';
                    continue;
                }

                if (!strncmp(p, "&amp;", 5))
                {
                    p += 5;
                    *j++ = '&';
                    continue;
                }

                // drop everything between an escaped '<' and '>'
                if (!strncmp(p, "&lt;", 4))
                {
                    p += 4;
                    while (*p)
                    {
                        if (!strncmp(p, "&gt;", 4))
                        {
                            p += 4;
                            break;
                        }
                        p++;
                    }
                    continue;
                }

                if (*p == '\n')
                {
                    p++;
                    continue;
                }

                if (!strncmp(p, "[[", 2))
                    break;

                *j++ = *p++;
            }
            *j = '\0';
            s++;

            /* cut the name off right after the first known extension */
            for (j = NULL, y = 0; y < (sizeof(imagetypes) / sizeof (char *)); y++)
            {
                j = (unsigned char *)strnstr(lp, imagetypes[y]);
                if (j)
                {
                    j += strlen(imagetypes[y]);
                    *j = '\0';
                    break;
                }
            }

            if (!j)
            {
                /* no known extension: log names that look like
                   "something.ext" with a 3 to 5 letter extension */
                if (*lp && isalpha(*lp))
                {
                    unsigned char *sp = strchr(lp, '.');
                    unsigned char *sj, *slp = lp;

                    if (sp)
                    {
                        unsigned char *sllp = sp, *meter;

                        sllp++;
                        if ((*sllp != ' ') && (isalpha(*sllp)))
                        {
                            sj = str8rchr(slp, ':', '/', '\\', '{', '\n', '&',
                                          '=', '>');
                            if (sj)
                                slp = ++sj;

                            meter = sllp;
                            while (*sllp)
                            {
                                if (!isalpha(*sllp))
                                {
                                    *sllp = '\0';
                                    break;
                                }
                                sllp++;
                            }

                            if (*slp &&
                                (((sllp - meter) >= 3) && ((sllp - meter) <= 5)))
                            {
                                if (*title)
                                    fprintf(imagereject, "[%s] %s\n", title, slp);
                                else
                                    fprintf(imagereject, "%s\n", slp);
                                fflush(imagereject);
                            }
                        }
                    }
                }
                return s;
            }

            /* drop a leading namespace or path component */
            j = str5rchr(lp, ':', '/', '\\', '{', '\n');
            if (j)
                lp = ++j;

            if (!*lp)
                return s;

            /* ulwk, expand, final1 and final2 are fixed-size buffers and
               shell escaping can double the length of a name */
            if (strlen(lp) >= sizeof(final1) / 2)
                return s;

#ifdef UNICODE_EXPANSION
            // filename string extracted.  convert xml control character tags
            l = &expand[0];
            ulp = lp;
            while (*ulp)
            {
                if (!strncasecmp(ulp, "&amp;", 5))
                {
                    ulp += 5;
                    *l++ = '&';
                    continue;
                }

                if (!strncasecmp(ulp, "&lt;", 4))
                {
                    ulp += 4;
                    *l++ = '<';
                    continue;
                }

                if (!strncasecmp(ulp, "&gt;", 4))
                {
                    ulp += 4;
                    *l++ = '>';
                    continue;
                }

                if (!strncasecmp(ulp, "&quot;", 6))
                {
                    ulp += 6;
                    *l++ = '\"';
                    continue;
                }

                if (!strncasecmp(ulp, "&apos;", 6))
                {
                    ulp += 6;
                    *l++ = '\'';
                    continue;
                }

                if (!strncasecmp(ulp, "&nbsp;", 6))
                {
                    ulp += 6;
                    *l++ = ' ';
                    continue;
                }

                /* NOTE(review): the entity literal for the en dash was lost
                   when this listing was extracted (it was rendered as the
                   dash itself).  "&#150;" is assumed because it matches the
                   6-byte compare and skip -- confirm against upstream. */
                if (!strncasecmp(ulp, "&#150;", 6))
                {
                    ulp += 6;
                    *l++ = '-';
                    continue;
                }

                if ((ulp[0] == '&') && (ulp[1] != '&'))
                {
                    unsigned char *sc = strchr(ulp, ';'), *slp;
                    unsigned char unicode[32];
                    unsigned short uni;

                    if (sc)
                    {
                        slp = ulp;
                        slp++;
                        while (*slp != ';')
                        {
                            if ((*slp == '#') || (*slp == '-') ||
                                (*slp == 'x') || (*slp == 'X') ||
                                isxdigit(*slp))
                                slp++;
                            else
                            {
                                invl = 1;
                                break;
                            }
                        }

                        if (!invl)
                        {
                            int unilen = sc - ulp;
                            int slen = sc - ulp;

                            slp = ulp;
                            slp++;
                            unilen--;
                            if (*slp == '#')
                            {
                                unilen--;
                                slp++;
                            }

                            if (unilen < 31)
                            {
                                memset(unicode, 0, 32);
                                strncpy(unicode, slp, unilen);
                                uni = atoi(unicode);

                                fprintf(imagelog, "UNI1: %s (#%d) %s \n",
                                        unicode, (int)uni,
                                        lp);

                                unicode[0] = '\0';
                                sprintf(unicode, "\\u%04X", uni);
                                unilen = u8_unescape(l, 32, unicode);

                                fprintf(imagelog, "UNI2: %s unilen %d slen %d\n",
                                        unicode, (int)unilen, (int)slen);

                                ulp += slen;
                                l += unilen;
                                ulp++;
                                unicnvt = 1;
                                continue;
                            }
                        }
                    }
                }
                *l++ = *ulp++;
            }
            *l = '\0';
            lp = &expand[0];
#endif

            // convert spaces to underline characters in image names
            ulp = &ulwk[0];
            memmove(ulp, lp, strlen(lp) + 1);
            ulp[0] = toupper(ulp[0]);
            for (l = ulp; *l; l++)
            {
                if (*l == ' ')
                {
                    *l = '_';
                    cnvt = 1;
                }
            }

            /* already handled under either spelling: nothing to emit */
            if (learn(lp, strlen(lp)))
                return s;

            if (cnvt && learn(ulp, strlen(ulp)))
                return s;

            memset(md5_out, 0, 16);
            lp[0] = toupper(lp[0]);

#ifdef UNICODE_EXPANSION
            if (unicnvt || invl)
            {
                if (invl)
                    fprintf(imagelog, "INVL: %s -> %s\n", wk, lp);
                else
                    fprintf(imagelog, "%s -> %s\n", wk, lp);
                fflush(imagelog);
                if (invl)
                    return s;
            }
            else
                return s;
#else
            fprintf(imagelog, "%s\n", lp);
            fflush(imagelog);
#endif

            /* the MD5 of the name selects the two-level directory */
            MD5(lp, strlen(lp), md5_out);
            snprintf((char *)dir1, sizeof dir1, "%x/%02x/",
                     (md5_out[0] >> 4), md5_out[0]);

            dir2[0] = '\0';
            if (cnvt)
            {
                memset(md5_ulout, 0, 16);
                ulp[0] = toupper(ulp[0]);
                MD5(ulp, strlen(ulp), md5_ulout);
                snprintf((char *)dir2, sizeof dir2, "%x/%02x/",
                         (md5_ulout[0] >> 4), md5_ulout[0]);
            }

            // add backslashes in front of bash control chars
            wikix_shell_escape(&final1[0], lp);
            blp = &final1[0];
            wikix_shell_escape(&final2[0], ulp);
            bulp = &final2[0];

            /* in parallel mode the top digest bits pick the script file */
            if (pmode)
                fp = fpl[(md5_out[0] >> 4) % 16];

            if (tree)
                wikix_emit_tree_script(fp, (const char *)dir1,
                                       (const char *)blp,
                                       (const char *)dir2,
                                       (const char *)bulp, cnvt);
            else
                wikix_emit_download_script(fp, (const char *)dir1,
                                           (const char *)blp,
                                           (const char *)dir2,
                                           (const char *)bulp, cnvt);
            return s;
        }
        ch = *s;
        s++;
    }
    return s;
}
// Buffer sizes used by main().  The path and fragment sizes mirror the 256
// byte counts this code has always passed to strncpy()/memset() for those
// global buffers; the line size is the historical fgets() limit.
enum
{
    WIKIX_PATH_BYTES     = 256,
    WIKIX_FRAGMENT_BYTES = 256,
    WIKIX_LINE_BYTES     = 8192 * 4,
    WIKIX_WORKBUF_BYTES  = 0x10000,
    WIKIX_WORKERS        = 16
};

/*
 * Copy a command line path into one of the global path buffers.
 * At most WIKIX_PATH_BYTES - 1 characters are copied and the result is
 * always NUL terminated (a bare strncpy() leaves long input unterminated).
 */
static void wikix_copy_path(void *dst, const char *src)
{
    char *d = dst;

    strncpy(d, src, WIKIX_PATH_BYTES - 1);
    d[WIKIX_PATH_BYTES - 1] = '\0';
}

/*
 * Store the text found between `text` and the next "</title>" into `title`.
 * `title` is left untouched when there is no closing tag on the line or when
 * the enclosed text is empty.
 */
static void wikix_store_title(unsigned char *title, const char *text)
{
    const char *close_tag = strstr(text, "</title>");

    if (close_tag && close_tag != text)
    {
        size_t len = (size_t)(close_tag - text);

        memcpy(title, text, len);
        title[len] = '\0';
    }
}

/*
 * Write the top of the master script: shebang, the four path variables and
 * the creation of the thumb/temp/tmp directories below $OUTPUT.
 */
static void wikix_write_master_header(FILE *out)
{
    static const char *const subdirs[] = { "thumb", "temp", "tmp" };
    size_t n;

    fprintf(out, "#!/bin/sh\n\n");
    fprintf(out, "IMAGE=%s\n", ImagePath);
    fprintf(out, "OUTPUT=%s\n", OutputPath);
    fprintf(out, "IMAGEPATH=%s\n", iPath);
    fprintf(out, "COMMONSPATH=%s\n\n", cPath);
    for (n = 0; n < sizeof subdirs / sizeof subdirs[0]; n++)
    {
        fprintf(out, "/bin/mkdir -p $OUTPUT./%s\n", subdirs[n]);
        fprintf(out, "/bin/chmod 777 $OUTPUT./%s\n", subdirs[n]);
    }
    fprintf(out, "\n");
}

/*
 * Program entry point.
 *
 * Parses the command line (-h, -t, -r, -c, -i, -o, -p), writes the script
 * header(s), then reads an XML dump from stdin line by line.  It tracks the
 * current <page>/<title>, and for every "image..." or "map..." keyword that is
 * followed by '=' or ':' hands the remainder of the line to
 * strip_image_info().
 *
 * In normal mode the script header is written to stdout; with -p a master
 * script "image_sh" plus 16 worker scripts "image00".."image15" are created
 * in the current directory.
 *
 * Returns 0 on success; exits with status 1 on -h or on a fatal setup error.
 */
int main(int argc, char *argv[])
{
    int i, r, inpage = 0;
    unsigned char *s, *buffer, *title;
    char fname[32];
    FILE *fl;

    ImagePath[0] = '\0';
    OutputPath[0] = '\0';

    // Default download locations; -r and -c override them.
    strcpy(iPath, "http://upload.wikimedia.org/wikipedia/en/");
    strcpy(cPath, "http://upload.wikimedia.org/wikipedia/commons/");

    // Options are matched on their first two characters only.  argv[0] is
    // the program name and is not treated as an option.
    for (i = 1; i < argc; i++)
    {
        const char *arg = argv[i];
        void *path_dst = NULL;

        if (!strncmp(arg, "-h", 2))
        {
            printf("USAGE: wikix -htrciop < file.xml [ > script.out ]\n");
            printf(" -h this help screen\n");
            printf(" -t use xml dump to strip from tree\n");
            printf(" -r wikipedia path\n");
            printf(" -c commons path\n");
            printf(" -i image path\n");
            printf(" -o output path\n");
            printf(" -p parallel (16 process) mode\n");
            exit(1);
        }
        else if (!strncmp(arg, "-t", 2))
            tree = 1;
        else if (!strncmp(arg, "-p", 2))
            pmode = 1;          // parallel mode (16 processes)
        else if (!strncmp(arg, "-r", 2))
            path_dst = iPath;       // remote wikipedia path
        else if (!strncmp(arg, "-c", 2))
            path_dst = cPath;       // remote commons path
        else if (!strncmp(arg, "-i", 2))
            path_dst = ImagePath;   // image tree
        else if (!strncmp(arg, "-o", 2))
            path_dst = OutputPath;  // output image tree

        // A path option consumes the following argument as its value.  A
        // trailing option with no value is ignored instead of walking onto
        // the NULL that terminates argv.
        if (path_dst && i + 1 < argc)
            wikix_copy_path(path_dst, argv[++i]);
    }

    memset(&fwk[0], 0xFF, WIKIX_FRAGMENT_BYTES);

    if (!init_hash_list())
    {
        fprintf(stderr, "wikix: could not allocate workspace\n");
        exit(1);
    }

    buffer = malloc(WIKIX_WORKBUF_BYTES);
    if (!buffer)
    {
        fprintf(stderr, "gfdl-wikititle: could not allocate buffer workspace\n");
        exit(1);
    }
    buffer[0] = '\0';

    title = malloc(WIKIX_WORKBUF_BYTES);
    if (!title)
    {
        fprintf(stderr, "gfdl-wikititle: could not allocate namespace\n");
        exit(1);
    }
    title[0] = '\0';

    if (!pmode)
    {
        // Single script mode: the script itself goes to stdout, so every
        // diagnostic in this function is sent to stderr.
        wikix_write_master_header(stdout);
    }
    else
    {
        fl = fopen("image_sh", "w");
        if (!fl)
        {
            fprintf(stderr, "FILE error could not create image_sh\n");
            exit(1);
        }
        chmod("image_sh", 0755);
        wikix_write_master_header(fl);

        for (r = 0; r < WIKIX_WORKERS; r++)
        {
            snprintf(fname, sizeof fname, "image%02d", r);
            fpl[r] = fopen(fname, "w");
            if (!fpl[r])
            {
                fprintf(stderr, "FILE error could not create [%s]\n", fname);
                exit(1);
            }
            chmod(fname, 0755);
            fprintf(fpl[r], "#!/bin/sh\n\n");
            fprintf(fpl[r], "\nIMAGE=%s\n", ImagePath);
            fprintf(fpl[r], "OUTPUT=%s\n", OutputPath);
            fprintf(fpl[r], "IMAGEPATH=%s\n", iPath);
            fprintf(fpl[r], "COMMONSPATH=%s\n\n", cPath);
            // image_sh runs under /bin/sh, so use the POSIX form of
            // "redirect stdout and stderr" rather than the bash-only ">&".
            fprintf(fl, "./%s > imagelog.%02d 2>&1 &\n", fname, r);
        }
        fclose(fl);
    }

    // The log files are best effort: a failure is reported but not fatal.
    imagelog = fopen("image.log", "wb");
    if (!imagelog)
        fprintf(stderr, "FILE error could not create image log\n");

    imagereject = fopen("reject.log", "wb");
    if (!imagereject)
        fprintf(stderr, "FILE error could not create reject log\n");

    fragmentlog = fopen("fragment.log", "wb");
    if (!fragmentlog)
        fprintf(stderr, "FILE error could not create image name fragment log\n");

    while ((s = (unsigned char *)fgets((char *)buffer, WIKIX_LINE_BYTES,
                                       stdin)) != NULL)
    {
        // Character preceding the current scan position; a keyword only
        // counts when it is not glued to a preceding alphanumeric.
        unsigned char ch = '\0';
        char *title_p;

        if (strstr((char *)s, "<page>"))
        {
            inpage++;
            *title = '\0';
            continue;
        }
        if (strstr((char *)s, "</page>"))
        {
            if (inpage)
                inpage--;
            *title = '\0';
            continue;
        }

        title_p = strstr((char *)s, "<title>");
        if (inpage && title_p)
            wikix_store_title(title, title_p + 7);

        while (*s)
        {
            if (inpage && !strncasecmp((char *)s, "<title>", 7))
            {
                s += 7;
                wikix_store_title(title, (char *)s);
                // "<title>" was the last thing on the line: stop here rather
                // than stepping over the terminator into stale buffer data.
                if (!*s)
                    break;
            }

            if ((!strncasecmp((char *)s, "image", 5) ||
                 !strncasecmp((char *)s, "map", 3)) &&
                !isalnum(ch))
            {
                unsigned char *fragment = s;
                unsigned char *end;

                if (!strncasecmp((char *)s, "image", 5))
                    s += 5;
                else
                    s += 3;

                if (*s)
                {
                    // The keyword may carry an alphanumeric suffix and be
                    // followed by blanks before the '=' or ':'.
                    while (*s && isalnum(*s))
                        s++;
                    end = s;
                    while (*s && isspace(*s))
                        s++;

                    if (*s == '=' || *s == ':')
                    {
                        // Keep the fragment inside fwk and leave room for
                        // the terminating NUL provided by the memset.
                        if (end - fragment > WIKIX_FRAGMENT_BYTES - 1)
                            end = fragment + (WIKIX_FRAGMENT_BYTES - 1);

                        memset(&fwk[0], 0, WIKIX_FRAGMENT_BYTES);
                        memmove(&fwk[0], fragment, (size_t)(end - fragment));
                        if (!imagename(&fwk[0], end - fragment) && fragmentlog)
                        {
                            if (*title)
                                fprintf(fragmentlog, "[%s] %s\n", title, &fwk[0]);
                            else
                                fprintf(fragmentlog, "%s\n", &fwk[0]);
                            fflush(fragmentlog);
                        }
                        s++;
                        s = strip_image_info(s, title);
                        ch = '\0';
                    }
                }
                continue;
            }

            ch = *s;
            s++;
        }
    }

    if (pmode)
    {
        for (r = 0; r < WIKIX_WORKERS; r++)
        {
            if (fpl[r])
                fclose(fpl[r]);
            fpl[r] = NULL;
        }
    }

    if (fragmentlog)
        fclose(fragmentlog);
    if (imagelog)
        fclose(imagelog);
    if (imagereject)
        fclose(imagereject);

    free(title);
    free(buffer);
    free_hash();
    return 0;
}