Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalize all unicode identifiers to NFC #5462

Merged
merged 2 commits into from
Jan 22, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Make.inc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ USE_SYSTEM_ZLIB=0
USE_SYSTEM_GRISU=0
USE_SYSTEM_RMATH=0
USE_SYSTEM_LIBUV=0
USE_SYSTEM_UTF8PROC=0

USE_MKL = 0

Expand Down Expand Up @@ -352,6 +353,12 @@ endif
LIBUV_INC = $(JULIAHOME)/deps/libuv/include
endif

# Static utf8proc archive to link against: the copy built into $(BUILD) by
# default, or the fixed system path when USE_SYSTEM_UTF8PROC is 1.
LIBUTF8PROC = $(BUILD)/$(JL_LIBDIR)/libutf8proc.a
ifeq ($(USE_SYSTEM_UTF8PROC), 1)
LIBUTF8PROC = /usr/lib/libutf8proc.a
endif

# OS specific stuff

# install_name_tool
Expand Down
1 change: 1 addition & 0 deletions deps/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@
/root
/SuiteSparse-*
/zlib-*
/utf8proc-*
41 changes: 39 additions & 2 deletions deps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ endif
CONFIGURE_COMMON += F77="$(FC)" CC="$(CC)" CXX="$(CXX)"

#autoconf configure-driven scripts: llvm readline pcre arpack fftw unwind gmp mpfr patchelf uv
#custom Makefile rules: openlibm Rmath double-conversion random suitesparse-wrapper suitesparse lapack openblas
#custom Makefile rules: openlibm Rmath double-conversion random suitesparse-wrapper suitesparse lapack openblas utf8proc

# prevent installing libs into usr/lib64 on opensuse
unexport CONFIG_SITE
Expand Down Expand Up @@ -115,6 +115,10 @@ ifeq ($(USE_SYSTEM_SUITESPARSE), 0)
STAGE2_DEPS += suitesparse
endif

# Add utf8proc to the stage-2 dependency list only when USE_SYSTEM_UTF8PROC is 0.
ifeq ($(USE_SYSTEM_UTF8PROC), 0)
STAGE2_DEPS += utf8proc
endif

# Only compile standalone LAPACK if we are not using OpenBLAS.
# OpenBLAS otherwise compiles LAPACK as part of its build.
# This is useful where one wants to use the vendor BLAS, but
Expand Down Expand Up @@ -144,7 +148,7 @@ install: $(addprefix install-, $(LIBS))
cleanall: $(addprefix clean-, $(LIBS))
distclean: $(addprefix distclean-, $(LIBS))
rm -rf $(BUILD)
getall: get-llvm get-readline get-uv get-pcre get-double-conversion get-openlibm get-openspecfun get-random get-openblas get-lapack get-fftw get-suitesparse get-arpack get-unwind get-osxunwind get-gmp get-mpfr get-zlib get-patchelf
getall: get-llvm get-readline get-uv get-pcre get-double-conversion get-openlibm get-openspecfun get-random get-openblas get-lapack get-fftw get-suitesparse get-arpack get-unwind get-osxunwind get-gmp get-mpfr get-zlib get-patchelf get-utf8proc

## PATHS ##
DIRS = $(addprefix $(BUILD)/,lib include bin share etc)
Expand Down Expand Up @@ -1225,6 +1229,39 @@ compile-fftw-double: $(FFTW_DOUBLE_OBJ_TARGET)
check-fftw-double: fftw-$(FFTW_VER)-double/checked
install-fftw-double: $(FFTW_DOUBLE_OBJ_TARGET)

## UTF8PROC ##

# Static library as produced inside the extracted utf8proc source tree.
UTF8PROC_OBJ_SOURCE = utf8proc-v$(UTF8PROC_VER)/libutf8proc.a
# Copies of the library and its header placed under $(BUILD).
UTF8PROC_OBJ_LIB = $(BUILD)/$(JL_LIBDIR)/libutf8proc.a
UTF8PROC_OBJ_HEADER = $(BUILD)/include/utf8proc.h
UTF8PROC_OBJ_TARGET = $(UTF8PROC_OBJ_LIB) $(UTF8PROC_OBJ_HEADER)

# Download the release tarball for the pinned UTF8PROC_VER.
utf8proc-v$(UTF8PROC_VER).tar.gz:
	$(JLDOWNLOAD) $@ http://www.public-software-group.org/pub/projects/utf8proc/v$(UTF8PROC_VER)/$@

# Unpack the sources. The touch refreshes the extracted Makefile's timestamp
# so it is newer than the tarball; -c keeps touch from creating the file if
# extraction did not produce it.
utf8proc-v$(UTF8PROC_VER)/Makefile: utf8proc-v$(UTF8PROC_VER).tar.gz
	tar -xzf $<
	touch -c $@

# Build only the static library, passing cc=... on the command line so the
# configured compiler and PIC flag are used (assumes the utf8proc Makefile
# compiles through a variable named cc -- confirm against upstream).
$(UTF8PROC_OBJ_SOURCE): utf8proc-v$(UTF8PROC_VER)/Makefile
	$(MAKE) -C utf8proc-v$(UTF8PROC_VER) cc="$(CC) -O2 -std=c99 $(fPIC)" libutf8proc.a

# Copy the built archive into the build tree, creating the directory if needed.
$(UTF8PROC_OBJ_LIB): $(UTF8PROC_OBJ_SOURCE)
	mkdir -p $(@D)
	cp -f $< $@

# Copy the public header into the build tree, creating the directory if needed.
$(UTF8PROC_OBJ_HEADER): utf8proc-v$(UTF8PROC_VER)/Makefile
	mkdir -p $(@D)
	cp -f utf8proc-v$(UTF8PROC_VER)/utf8proc.h $@

# Errors are ignored here (leading -): the source tree may not exist yet.
clean-utf8proc:
	-$(MAKE) -C utf8proc-v$(UTF8PROC_VER) clean
distclean-utf8proc: clean-utf8proc
	-rm -rf utf8proc-v$(UTF8PROC_VER).tar.gz utf8proc-v$(UTF8PROC_VER)

# Per-stage entry points; check-utf8proc intentionally has nothing to run.
get-utf8proc: utf8proc-v$(UTF8PROC_VER).tar.gz
configure-utf8proc: get-utf8proc
compile-utf8proc: $(UTF8PROC_OBJ_SOURCE)
check-utf8proc:
install-utf8proc: $(UTF8PROC_OBJ_TARGET)

## SUITESPARSE ##

Expand Down
1 change: 1 addition & 0 deletions deps/Versions.make
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ MPFR_VER=3.1.2
ZLIB_VER = 1.2.8
PATCHELF_VER = 0.6
GIT_VER = 1.8.2.3
UTF8PROC_VER = 1.1.6
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ifeq ($(USE_LLVM_SHLIB),1)
LLVMLINK = -lLLVM-$(LLVM_VER)
endif

LIBS = $(WHOLE_ARCHIVE) $(JULIAHOME)/src/flisp/libflisp.a $(WHOLE_ARCHIVE) $(JULIAHOME)/src/support/libsupport.a -L$(BUILD)/lib $(LIBUV) $(NO_WHOLE_ARCHIVE) $(call exec,$(LLVM_CONFIG) --ldflags) $(LLVMLINK) $(OSLIBS)
LIBS = $(WHOLE_ARCHIVE) $(JULIAHOME)/src/flisp/libflisp.a $(WHOLE_ARCHIVE) $(JULIAHOME)/src/support/libsupport.a -L$(BUILD)/lib $(LIBUV) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(call exec,$(LLVM_CONFIG) --ldflags) $(LLVMLINK) $(OSLIBS)

ifneq ($(MAKECMDGOALS),debug)
TARGET =
Expand Down
5 changes: 3 additions & 2 deletions src/flisp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@ SRCS = flisp.c builtins.c string.c equalhash.c table.c iostream.c \
OBJS = $(SRCS:%.c=%.o)
DOBJS = $(SRCS:%.c=%.do)
LLTDIR = ../support
LLT = $(LLTDIR)/libsupport.a $(LIBUV)
LLT = $(LLTDIR)/libsupport.a $(LIBUV) $(LIBUTF8PROC)

FLAGS = -Wall -Wno-strict-aliasing -I$(LLTDIR) $(CFLAGS) \
-DUSE_COMPUTED_GOTO $(HFILEDIRS:%=-I%) -I$(LIBUV_INC) $(LIBDIRS:%=-L%) \
-DUSE_COMPUTED_GOTO $(HFILEDIRS:%=-I%) \
-I$(LIBUV_INC) -I$(JULIAHOME)/usr/include $(LIBDIRS:%=-L%) \
-fvisibility=hidden -DLIBRARY_EXPORTS
LIBFILES = $(LLT)
LIBS = $(LIBFILES)
Expand Down
42 changes: 37 additions & 5 deletions src/flisp/flisp.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ char * dirname(char *);
#include "flisp.h"
#include "opcodes.h"

#include "utf8proc.h"

static char *builtin_names[] =
{ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
Expand Down Expand Up @@ -253,12 +255,40 @@ SAFECAST_OP(string,char*, cvalue_data)

symbol_t *symtab = NULL;

int fl_is_keyword_name(char *str, size_t len)
int fl_is_keyword_name(const char *str, size_t len)
{
return len>1 && ((str[0] == ':' || str[len-1] == ':') && str[1] != '\0');
}

static symbol_t *mk_symbol(char *str)
// return NFC-normalized UTF8-encoded version of s
static const char *normalize(char *s)
{
static size_t buflen = 0;
static void *buf = NULL; // persistent buffer (avoid repeated malloc/free)
// options equivalent to utf8proc_NFC:
const int options = UTF8PROC_NULLTERM|UTF8PROC_STABLE|UTF8PROC_COMPOSE;
ssize_t result;
size_t newlen;
result = utf8proc_decompose((uint8_t*) s, 0, NULL, 0, options);
if (result < 0) goto error;
newlen = result * sizeof(int32_t) + 1;
if (newlen > buflen) {
buflen = newlen * 2;
buf = realloc(buf, buflen);
if (!buf) lerror(MemoryError, "error allocating UTF8 buffer");
}
result = utf8proc_decompose((uint8_t*)s,0, (int32_t*)buf,result, options);
if (result < 0) goto error;
result = utf8proc_reencode((int32_t*)buf,result, options);
if (result < 0) goto error;
return (char*) buf;
error:
lerrorf(ParseError, "error normalizing identifier %s: %s", s,
utf8proc_errmsg(result));
}

// note: assumes str is normalized
static symbol_t *mk_symbol(const char *str)
{
symbol_t *sym;
size_t len = strlen(str);
Expand All @@ -282,7 +312,8 @@ static symbol_t *mk_symbol(char *str)
return sym;
}

static symbol_t **symtab_lookup(symbol_t **ptree, char *str)
// note: assumes str is normalized
static symbol_t **symtab_lookup(symbol_t **ptree, const char *str)
{
int x;

Expand All @@ -301,10 +332,11 @@ static symbol_t **symtab_lookup(symbol_t **ptree, char *str)
value_t symbol(char *str)
{
symbol_t **pnode;
const char *nstr = normalize(str);

pnode = symtab_lookup(&symtab, str);
pnode = symtab_lookup(&symtab, nstr);
if (*pnode == NULL)
*pnode = mk_symbol(str);
*pnode = mk_symbol(nstr);
return tagptr(*pnode, TAG_SYM);
}

Expand Down
2 changes: 1 addition & 1 deletion src/flisp/flisp.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ value_t fl_list2(value_t a, value_t b);
value_t fl_listn(size_t n, ...);
value_t symbol(char *str);
char *symbol_name(value_t v);
int fl_is_keyword_name(char *str, size_t len);
int fl_is_keyword_name(const char *str, size_t len);
value_t alloc_vector(size_t n, int init);
size_t llength(value_t v);
value_t fl_compare(value_t a, value_t b); // -1, 0, or 1
Expand Down